# Avocado is a fruit consumed heavily in the United States. The goal is to predict the average price of a single avocado.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
df=pd.read_csv("Avacado.csv")

In [None]:
df.head(5)

In [None]:
df.shape

In [None]:
df.columns

# Problem Definition





This data was downloaded from the Hass Avocado Board website in May of 2018 & compiled into a single CSV. The table below represents weekly 2018 retail scan data for National retail volume (units) and price. Retail scan data comes directly from retailers’ cash registers based on actual retail sales of Hass avocados. Starting in 2013, the table below reflects an expanded, multi-outlet retail data set. Multi-outlet reporting includes an aggregation of the following channels: grocery, mass, club, drug, dollar and military. The Average Price (of avocados) in the table reflects a per unit (per avocado) cost, even when multiple units (avocados) are sold in bags. The Product Lookup codes (PLU’s) in the table are only for Hass avocados. Other varieties of avocados (e.g. greenskins) are not included in this table.

In [None]:
df.tail()

In [None]:
# Keep only the rows that carry data. The notebook notes that the rows being
# removed are NaN in every column, so dropna(how="all") drops exactly those
# without hard-coding a row count. .copy() makes df1 an independent frame, so
# the later in-place edits no longer act on a slice of df.
df1 = df.dropna(how="all").copy()

In [None]:
df1

In [None]:
df1.shape

#Removed the empty rows, which contain NaN values in all the columns, and saved the remaining data in the df1 variable

In [None]:
# Lift the row limit so displayed frames and series are never truncated.
pd.set_option("display.max_rows", None)

In [None]:
df1["Unnamed: 0"].value_counts()

In [None]:
# Discard the "Unnamed: 0" column by rebinding df1 to the reduced frame.
df1 = df1.drop(columns=["Unnamed: 0"])

The feature "Unnamed: 0" is just a representation of the row indexes, so it is useless to keep it; the column was therefore removed.

In [None]:
#Now we will perform EDA on date column and region column as all the other column are numeric data type.

In [None]:
# Bar chart of the mean AveragePrice per region.
# The original aggregated with .count(), which plots the number of rows per
# region while labelling the axis "AveragePrice".
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(12, 8))
df1.groupby('region')['AveragePrice'].mean().plot.bar(ax=ax, ylim=0)
ax.set_ylabel('AveragePrice')
plt.show()

In [None]:
#Charlotte has the maximum average price of avocados sold among all the regions

In [None]:
# Bar chart of the mean AveragePrice per date.
# The original aggregated with .count(), which plots how many rows share each
# date rather than the price the axis label announces.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(15, 10))
df1.groupby('Date')['AveragePrice'].mean().plot.bar(ax=ax, ylim=0)
ax.set_ylabel('AveragePrice')
plt.show()

In [None]:
#Date column compared with the average price 

In [None]:
# Parse the Date column and derive Month / Day features from it.
# NOTE(review): pd.to_datetime infers the date format here; if the CSV stores
# day-first strings, pass dayfirst=True or an explicit format -- confirm.
df1['Date'] = pd.to_datetime(df1['Date'])
# The vectorised .dt accessor replaces the row-by-row apply(lambda ...).
df1['Month'] = df1['Date'].dt.month
df1['Day'] = df1['Date'].dt.day

In [None]:
df1.head()

Separated the Date column into Day and Month columns, as a year column was already present.

In [None]:
# Month and Day have been extracted, so the Date column is discarded.
df1 = df1.drop(columns=["Date"])

removed the date column as the data of "Date" column has been extracted

In [None]:
df1.info()

now we will work on the type and region column as both are of object datatype

In [None]:
df1["type"].value_counts()

In [None]:
# Discard the "type" column by rebinding df1 to the reduced frame.
df1 = df1.drop(columns=["type"])

Dropped the type column as it has single unique value

In [None]:
df1.region.unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the region labels, then overwrite the column with their integer codes.
LE = LabelEncoder()
LE.fit(df1["region"])
df1["region"] = LE.transform(df1["region"])

In [None]:
df1.region.unique()

Applied label encoder on the region column to convert it into the int or float datatype.

In [None]:
df1.info()

The data has no object datatype

In [None]:
df1.loc[df1['XLarge Bags']==0.0]

In [None]:
df1["XLarge Bags"].value_counts()

In [None]:
# Call the method: without the parentheses the cell only shows the bound
# method object instead of the distinct values.
df1["XLarge Bags"].unique()

In [None]:
sns.distplot(df1["XLarge Bags"],kde=True)

#There are no null values in the XLarge Bags column, but there are 0 values, which are treated as missing data and need to be handled. We made the distribution plot to see whether the column is normally distributed; the graph is right-skewed, so we will replace those values with the median.

In [None]:
df1["XLarge Bags"].median()

In [None]:
df1["XLarge Bags"].mean()

The median of XLarge Bags comes out as zero, so we will try the mean instead. After replacing the 0 values with the mean we will check the distribution of the column again; if it is still not acceptable, we will drop the column.

In [None]:
# Replace the 0 entries of "XLarge Bags" with the column mean.
# The mean is computed from the data (zeros included, as in the cell that
# displays it) instead of being pasted in as the rounded literal 2181.77.
xlarge_mean = df1["XLarge Bags"].mean()
df1["XLarge Bags"] = df1["XLarge Bags"].replace(0.0, xlarge_mean)

In [None]:
df1["XLarge Bags"].value_counts()

In [None]:
sns.distplot(df1["XLarge Bags"],kde=True)

In [None]:
#we have replaced the 0 values in the XLarge Bags column with the mean value

In [None]:
df1["Total Bags"].value_counts()

In [None]:
df1["Small Bags"].value_counts()

In [None]:
df1["Large Bags"].value_counts()

#Of the Total Bags, Small Bags and Large Bags columns, only the Large Bags column contains 0 values that should be treated, so we now treat the values of the Large Bags column.

In [None]:
sns.distplot(df1["Large Bags"],kde=True)

In [None]:
df1["Large Bags"].median()

In [None]:
# Replace the 0 entries of "Large Bags" with the column median.
# NOTE(review): the original hard-coded 5044.35, presumably the median shown
# by the preceding cell -- confirm that the literal was indeed the median.
large_median = df1["Large Bags"].median()
df1["Large Bags"] = df1["Large Bags"].replace(0.0, large_median)

In [None]:
df1["Large Bags"].value_counts()

In [None]:
sns.distplot(df1["Large Bags"],kde=True)

In [None]:
df1.info()

In [None]:
df1["AveragePrice"].value_counts()

In [None]:
df1["Total Volume"].value_counts()

In [None]:
df1["4046"].value_counts() 

In [None]:
df1["4225"].value_counts()

In [None]:
df1["4770"].value_counts()

In [None]:
df1["year"].value_counts()

In [None]:
df1["Month"].value_counts()

In [None]:
df1["Day"].value_counts()

In [None]:
#Checked all the columns; there are no 0 values

In [None]:
df1.isnull().sum()

There are no null values in the dataset

In [None]:
from scipy.stats import norm

# Histogram of the target with a fitted normal curve overlaid (no KDE).
sns.set(font_scale=1.5)
fig, ax = plt.subplots(figsize=(15, 9))
target_values = df1["AveragePrice"]
sns.distplot(a=target_values, kde=False, fit=norm)

In [None]:
#Average price graph to see the distribution of graph and it is evenly distributed

In [None]:
dfcor=df1.corr()

In [None]:
dfcor

Correlation between the columns

In [None]:
sns.heatmap(dfcor)

year column and region column are negatively correlated

In [None]:
plt.figure(figsize=(15,10))
sns.heatmap(dfcor,cmap="Purples",annot=True)

#The annotated heatmap lets us read the correlation values. Darker cells represent higher correlation values and lighter cells represent lower ones.

In [None]:
# Correlation of every feature with AveragePrice, largest first; the target's
# own entry is removed before plotting.
plt.figure(figsize=(22, 7))
price_corr = df1.corr()["AveragePrice"]
price_corr_sorted = price_corr.sort_values(ascending=False)
feature_corr = price_corr_sorted.drop(["AveragePrice"])
feature_corr.plot(kind="bar", color="c")
plt.title("correlation", fontsize=18)
plt.xlabel("Feature", fontsize=14)
plt.ylabel("column with target names", fontsize=14)
plt.show()

In [None]:
#CORRELATION OF THE FEATURE COLUMNS WITH AveragePrice

In [None]:
df1.columns

In [None]:
# Univariate analysis: one box plot per column, in the same order as before.
# The twelve copy-pasted cells are collapsed into a single loop; each column
# gets its own figure so the plots do not pile onto shared axes.
box_columns = [
    "AveragePrice", "Total Volume", "4046", "4225", "4770",
    "Total Bags", "Small Bags", "Large Bags", "XLarge Bags",
    "Month", "Day", "year",
]
for column in box_columns:
    plt.figure()
    df1[column].plot.box()
    plt.show()

Now plotting distribution plots to check the skewness of the columns

In [None]:
from scipy.stats import norm

# Histogram with a fitted normal curve for each column, in the same order as
# before. The nine copy-pasted cells are collapsed into a single loop; the
# plotting call and its arguments are unchanged.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases --
# consider migrating to histplot when the environment is upgraded.
sns.set(font_scale=1.5)
skew_check_columns = [
    "year", "Total Volume", "4046", "4225", "4770",
    "Total Bags", "Small Bags", "Large Bags", "XLarge Bags",
]
for column in skew_check_columns:
    fig, ax = plt.subplots(figsize=(15, 9))
    sns.distplot(a=df1[column], kde=False, fit=norm)
    plt.show()

In [None]:
#All the above distributed plots are right skewed plots

In [None]:
#Bivariate analysis

In [None]:
df1.columns

In [None]:
# Mean of every numeric column per calendar date, then the price trend.
# df is used because df1 no longer has a Date column at this point.
# Parsing the dates makes the x-axis chronological instead of sorted as
# strings, and numeric_only=True keeps the text columns out of the mean
# (averaging them raises on current pandas).
# NOTE(review): pd.to_datetime infers the date format; pass dayfirst=True or
# an explicit format if the CSV stores day-first strings -- confirm.
byDate = (
    df.assign(Date=pd.to_datetime(df['Date']))
      .groupby('Date')
      .mean(numeric_only=True)
)
plt.figure(figsize=(12, 8))
byDate['AveragePrice'].plot()
plt.title('Average Price')

In [None]:
#The graph shows the mean of the average price for each date

In [None]:
# Bivariate analysis: AveragePrice (x) against each other column (y), in the
# same order as before. The twelve copy-pasted cells are collapsed into one
# loop, and axis labels are added so each figure identifies its columns.
scatter_columns = [
    "Total Volume", "4046", "4225", "4770",
    "Total Bags", "Small Bags", "Large Bags", "XLarge Bags",
    "year", "region", "Month", "Day",
]
for column in scatter_columns:
    plt.figure()
    plt.scatter(df1["AveragePrice"], df1[column])
    plt.xlabel("AveragePrice")
    plt.ylabel(column)
    plt.show()

Now we will work on data like removing skewness and outliers and scaling the data 

In [None]:
df1.skew()

In [None]:
#There is skewness present in 'Total Volume', '4046', '4225', '4770', 'Total Bags','Small Bags', 'Large Bags', 'XLarge Bags', 'year'

In [None]:
df1["Total Volume"]=np.log(df1["Total Volume"])

In [None]:
#Tried other methods as well, but the best skewness-removal method was the log transformation

In [None]:
df1.skew()

In [None]:
# Natural-log transform of the "4046" and "4225" columns, in place.
for plu_column in ("4046", "4225"):
    df1[plu_column] = np.log(df1[plu_column])

In [None]:
df1.skew()

In [None]:
df1["4770"]=np.cbrt(df1["4770"])

In [None]:
# Natural-log transform of the four bag-count columns, in place.
for bag_column in ("Total Bags", "Small Bags", "Large Bags", "XLarge Bags"):
    df1[bag_column] = np.log(df1[bag_column])

In [None]:
df1.skew()

In [None]:
#Tried to remove as much skewness can be removed

Now as we removed the skewness now again we will check the outliers through boxplot


In [None]:
# Box plots of the transformed columns, to re-check outliers after the
# skewness treatment.
col = ["Total Volume", "4046", "4225", "4770",
       "Total Bags", "Small Bags", "Large Bags", "XLarge Bags"]

for i in col:
    plt.figure()
    # Plot df1 (the transformed frame). The original plotted df, i.e. the raw
    # CSV values, so the transformations never showed up in these plots.
    df1[i].plot.box()

now we will treat the outliers

In [None]:
#graph analysis
sns.pairplot(df1)

In [None]:
from scipy.stats import zscore


In [None]:
# Keep the rows whose absolute z-score is below 3 in every listed column.
outlier_columns = ["Total Volume", "4046", "4225", "4770",
                   "Total Bags", "Small Bags", "Large Bags", "XLarge Bags"]
df2 = df1[outlier_columns]
z = np.abs(zscore(df2))
rows_within_limit = (z < 3).all(axis=1)
df_new = df1[rows_within_limit]

In [None]:
# Report how many rows the z-score filter removed.
rows_before = df1.shape[0]
rows_after = df_new.shape[0]
print("shape before and after")
print("shape before".ljust(20), ":", df1.shape)
print("shape after".ljust(20), ":", df_new.shape)
# Scaled by 100 so the figure really is a percentage (the original printed
# the raw fraction under a misspelled "pecentage" label).
print("percentage loss".ljust(20), ":", (rows_before - rows_after) / rows_before * 100)

In [None]:
#2% of data loss is there while removing outliers with zscore

In [None]:
# First and third quartiles of each column, and the spread between them.
q1, q3 = df2.quantile(0.25), df2.quantile(0.75)
IQR = q3 - q1

In [None]:
# Drop every row that falls outside the 1.5 * IQR fences in any column.
lower_fence = q1 - 1.5 * IQR
upper_fence = q3 + 1.5 * IQR
has_outlier = ((df2 < lower_fence) | (df2 > upper_fence)).any(axis=1)
df_new1 = df1[~has_outlier]

In [None]:
# Report how many rows the IQR filter removed.
rows_before = df1.shape[0]
rows_after = df_new1.shape[0]
print("shape before and after")
print("shape before".ljust(20), ":", df1.shape)
print("shape after".ljust(20), ":", df_new1.shape)
# Scaled by 100 so the figure really is a percentage (the original printed
# the raw fraction under a misspelled "pecentage" label).
print("percentage loss".ljust(20), ":", (rows_before - rows_after) / rows_before * 100)

In [None]:
#there is 26 % data loss while removing outliers with IQR so we will go through the zscore only

In [None]:
df_new.columns

In [None]:
# Target is AveragePrice; every remaining column is a feature.
y = df_new["AveragePrice"]
x = df_new.drop(columns=["AveragePrice"])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features and wrap the result back into a DataFrame that
# carries the original column names.
# NOTE(review): the scaler is fitted on every row before any train/test
# split, so held-out rows influence the scaling statistics -- consider
# fitting on the training split only.
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(x), columns=x.columns)

In [None]:
#scaled the input data with the standard scaler

Now we will fit models on the data

In [None]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Refit the linear model on 100 differently seeded 80/20 splits and print the
# R^2 score on the training and testing partitions for each seed.
for i in range(100):
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=i
    )
    lr.fit(x_train, y_train)
    pred_train = lr.predict(x_train)
    pred_test = lr.predict(x_test)
    train_r2 = r2_score(y_train, pred_train)
    test_r2 = r2_score(y_test, pred_test)
    print(f"At random state {i},the training accuracy is:- {train_r2}")
    print(f"At random state {i},the testing accuracy is:- {test_r2}")

In [None]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
print("seprated the training and testing data")

In [None]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=4)
lr.fit(x_train,y_train)

In [None]:
pred_test=lr.predict(x_test)

In [None]:
print(r2_score(y_test,pred_test))

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import Lasso

# Grid-search the Lasso regularisation strength on the training split.
# random_state was removed from the grid: Lasso only consults it when
# selection="random", so with the default settings the ten seeds refitted
# the same model ten times for every alpha.
parameters = {"alpha": [.0001, .001, .01, .1, 1, 10]}
ls = Lasso()
clf = GridSearchCV(ls, parameters)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
# Fit Lasso with fixed hyper-parameters and score it on the test split.
ls = Lasso(alpha=.0001, random_state=0).fit(x_train, y_train)
ls.score(x_train, y_train)  # training R^2; the value is not stored or shown
pred_ls = ls.predict(x_test)

lss = r2_score(y_test, pred_ls)
lss

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Grid-search the split criterion and max_features of the random forest.
# "mse"/"mae" were renamed to "squared_error"/"absolute_error" and the "auto"
# max_features alias was removed in current scikit-learn; 1.0 (all features)
# is what "auto" meant for regressors. Requires scikit-learn >= 1.0.
parameters = {
    "criterion": ["squared_error", "absolute_error"],
    "max_features": [1.0, "sqrt", "log2"],
}
rf = RandomForestRegressor()
clf = GridSearchCV(rf, parameters)
clf.fit(x_train, y_train)

print(clf.best_params_)

In [None]:
# Fit the random forest with fixed hyper-parameters and report test R^2.
# "squared_error" is the current name of the former "mse" criterion, which
# current scikit-learn no longer accepts. Requires scikit-learn >= 1.0.
rf = RandomForestRegressor(criterion="squared_error", max_features="log2")
rf.fit(x_train, y_train)
rf.score(x_train, y_train)  # training R^2; the value is not stored or shown
pred_decision = rf.predict(x_test)

rfs = r2_score(y_test, pred_decision)
print("R2 score:", rfs*100)


In [None]:
from sklearn import metrics

# Error metrics of the random-forest predictions on the test split.
rf_mae = metrics.mean_absolute_error(y_test, pred_decision)
rf_mse = metrics.mean_squared_error(y_test, pred_decision)
print('MAE:', rf_mae)
print('MSE:', rf_mse)
print('RMSE:', np.sqrt(rf_mse))

In [None]:
# RandomForestRegressor is giving the best accuracy 

In [None]:
# Predicted against actual prices (red points) with the y = x reference
# line (blue) for the random-forest model.
plt.figure(figsize=(8, 6))
plt.scatter(x=y_test, y=pred_decision, color="r")
plt.plot(y_test, y_test, color="b")
plt.title("RandomForestRegressor", fontsize=18)
plt.xlabel("actual", fontsize=14)
plt.ylabel("predicted", fontsize=14)
plt.show()

In [None]:
#graph depicting the performance of the model (predicted versus actual values)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree and report its R^2 on the test split.
dtr = DecisionTreeRegressor()
dtr.fit(x_train, y_train)
dtr.score(x_train, y_train)  # training R^2; the value is not stored or shown
pred_decision1 = dtr.predict(x_test)

dtrs = r2_score(y_test, pred_decision1)
# Print the decision tree's own score; the original printed rfs, the
# random-forest score, here.
print("R2 score:", dtrs*100)


In [None]:
from sklearn import metrics

# Error metrics of the decision-tree predictions on the test split.
print('MAE:', metrics.mean_absolute_error(y_test, pred_decision1))
# Fixed the broken identifier "y_te st", which was a syntax error.
print('MSE:', metrics.mean_squared_error(y_test, pred_decision1))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred_decision1)))

In [None]:
#RandomForestRegressor is giving the best accuracy and also less MSE, MAE AND RMSE

In [None]:
import joblib
joblib.dump(rf,"Avacado_Final.obj")
print("object of the dataset has been created")    

In [None]:
# Reload the persisted model and score it on the test split.
# joblib.load is given the path directly, so no file handle is opened and
# left unclosed as in the original open(...) call.
loaded_model = joblib.load("Avacado_Final.obj")
result = loaded_model.score(x_test, y_test)
print(result)

In [None]:
# Two-row frame comparing the reloaded model's predictions with the true
# test targets. The original put pred_decision (also predictions) in the
# "Original" row, so it compared predictions with predictions.
conclusion = pd.DataFrame(
    [loaded_model.predict(x_test), y_test.to_numpy()],
    index=("Predicted", "Original"),
)

In [None]:
conclusion

In [None]:
#predicted values versus original values