In [None]:
pip install plotly

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go

In [None]:
# Read the training and test CSV files from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [None]:
# (rows, columns) of the training frame followed by the test frame.
(train.shape, test.shape)

In [None]:
# Names of the two identifier columns and of the prediction target.
User_id = 'User_ID'
Product_ID = 'Product_ID'
TARGET_COL = 'Purchase'

# Every remaining column of `train` is treated as a feature (original column order kept).
features = [col for col in train.columns if col not in (User_id, Product_ID, TARGET_COL)]

# Features handled as categorical.
cat_cols = [
    'Gender',
    'Age',
    'City_Category',
    'Stay_In_Current_City_Years',
]

# Whatever is a feature but not listed as categorical.
num_cols = [col for col in features if col not in cat_cols]

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Print the column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Share of each gender in the training data, drawn as a pie chart.
# The original line had lost its leading `train[` and could not be parsed.
gender_counts = train["Gender"].value_counts()

# Derive the slice labels from the actual index so they cannot be swapped if
# the count order (value_counts sorts by frequency) differs from the assumed one.
gender_labels = [{"M": "Male", "F": "Female"}.get(code, code) for code in gender_counts.index]

gender_counts.plot.pie(
    title="Gender",
    explode=(0.1, 0),  # assumes exactly two gender values are present
    labels=gender_labels,
    autopct="%1.1f%%",
    shadow=True,
)

In [None]:

# One box trace per product-category column, each with its own RGBA colour.
box_specs = (
    ('Product_Category_1', 'rgba(160,160,50,0.7)'),
    ('Product_Category_2', 'rgba(50,160,150,0.7)'),
    ('Product_Category_3', 'rgba(160,60,150,0.7)'),
)

t1_box, t2_box, t3_box = (
    go.Box(name=column, y=train[column], marker={'color': rgba})
    for column, rgba in box_specs
)

# Render the three traces together in the notebook.
fig_box = [t1_box, t2_box, t3_box]
iplot(fig_box)

In [None]:
# Number of rows per age bracket, split by gender.
# Columns are passed by keyword together with `data=`: a positional Series is
# interpreted as `data` by newer seaborn releases, which then rejects `hue`
# because no `x` was given.
sns.countplot(x="Age", hue="Gender", data=train).set_title("Age&Gender")
sns.despine()

In [None]:
# Mean product-category code per occupation, one line per category column.
# The columns are selected *before* aggregating so that `mean()` is never
# applied to the string columns of `train` (which raises in pandas >= 2.0).
product_cols = ["Product_Category_1", "Product_Category_2", "Product_Category_3"]
# Title corrected: the grouping key is Occupation, not Gender.
train.groupby("Occupation")[product_cols].mean().plot.line(title="Occupation&Product")
sns.despine()

In [None]:
# Distribution of the target variable (Purchase) using 25 bins.
# NOTE(review): `distplot` is deprecated in recent seaborn releases; consider
# `histplot`/`displot` once the seaborn version in use is confirmed.
sns.distplot(train["Purchase"], bins=25)

In [None]:
# Wide figure for the density estimate of Purchase.
plt.figure(figsize=(16, 5))

# Shape statistics of the target, printed above the plot.
print("Skew: {}".format(train["Purchase"].skew()))
print("Kurtosis: {}".format(train["Purchase"].kurtosis()))

ax = sns.kdeplot(train["Purchase"], shade=True, color='g')

# Ticks every 50 units from 0 up to (not including) 1200.
# NOTE(review): confirm this range matches the scale of Purchase.
plt.xticks(list(range(0, 1200, 50)))
plt.show()

In [None]:
# Keep only the numeric columns of the training frame and list their dtypes.
numeric_features = train.select_dtypes(include=np.number)
numeric_features.dtypes

In [None]:
# Histogram of Product_Category_1, bars coloured by age bracket.
fig = px.histogram(
    train,
    x="Product_Category_1",
    color="Age",
    title='Histogram',
)
fig.show()

In [None]:
# 2-D density of Purchase against Product_Category_1, with a marginal
# histogram on each axis.
fig = px.density_heatmap(
    train,
    x="Purchase",
    y="Product_Category_1",
    marginal_x="histogram",
    marginal_y="histogram",
    title='Histogram',
)
fig.show()

In [None]:
# Distribution of the variable Occupation.
# The column is named via `x=` with `data=`: a positional Series is taken as
# `data` by newer seaborn releases and is not counted per category.
sns.countplot(x="Occupation", data=train)

In [None]:
# Distribution of the variable Marital_Status.
# The column is named via `x=` with `data=`: a positional Series is taken as
# `data` by newer seaborn releases and is not counted per category.
sns.countplot(x="Marital_Status", data=train)

In [None]:
# Distribution of the variable Product_Category_1.
# The column is named via `x=` with `data=`: a positional Series is taken as
# `data` by newer seaborn releases and is not counted per category.
sns.countplot(x="Product_Category_1", data=train)

In [None]:
# Distribution of the variable Product_Category_2.
# The column is named via `x=` with `data=`: a positional Series is taken as
# `data` by newer seaborn releases and is not counted per category.
sns.countplot(x="Product_Category_2", data=train)

In [None]:
# Distribution of the variable Product_Category_3.
# The column is named via `x=` with `data=`: a positional Series is taken as
# `data` by newer seaborn releases and is not counted per category.
sns.countplot(x="Product_Category_3", data=train)

In [None]:
# Pairwise scatter plots of the three product-category columns,
# with points coloured by Purchase.
fig = px.scatter_matrix(
    train,
    dimensions=[
        "Product_Category_1",
        "Product_Category_2",
        "Product_Category_3",
    ],
    color="Purchase",
)
fig.show()

In [None]:
# Pairwise correlation of the numeric columns selected earlier.
corr = numeric_features.corr()

# Annotated heatmap of the correlation matrix; the colour scale is capped at 0.8.
f, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(
    corr,
    vmax=.8,
    annot=True,
    annot_kws={'size': 20},
);

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Label-encode the string columns so they can take part in the correlation
# matrix. The encoding is applied to a COPY: overwriting `train` in place
# would replace the category labels with integer codes for every later cell,
# and would leave the training rows with a Gender coding (LabelEncoder sorts
# the labels, so F=0, M=1) that is the opposite of the M->0 / F->1 mapping
# applied to the combined data further down.
le = preprocessing.LabelEncoder()
cat_col = train.select_dtypes(include="object").columns.tolist()
train_encoded = train.copy()
for i in cat_col:
    # Plain column assignment stores the codes with an integer dtype, so the
    # encoded columns are included by `.corr()`.
    train_encoded[i] = le.fit_transform(train_encoded[i])

fig, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(train_encoded.drop("User_ID", axis=1).corr(), annot=True, cmap="YlGnBu").set_title("Correlation between features")

In [None]:
# Object-dtype (string) columns of the training frame and their dtypes.
# The built-in "object" selector is used because the `np.object` alias was
# removed from NumPy (1.24) and raises AttributeError there.
cat_features = train.select_dtypes(include=["object"])
cat_features.dtypes

In [None]:
# Distribution of the variable Gender.
# The column is named via `x=` with `data=`: a positional Series is taken as
# `data` by newer seaborn releases and is not counted per category.
sns.countplot(x="Gender", data=train)

In [None]:
# Distribution of the variable Age.
# The column is named via `x=` with `data=`: a positional Series is taken as
# `data` by newer seaborn releases and is not counted per category.
sns.countplot(x="Age", data=train)

In [None]:
# Distribution of the variable City_Category.
# The column is named via `x=` with `data=`: a positional Series is taken as
# `data` by newer seaborn releases and is not counted per category.
sns.countplot(x="City_Category", data=train)

In [None]:
# Distribution of the variable Stay_In_Current_City_Years.
# The column is named via `x=` with `data=`: a positional Series is taken as
# `data` by newer seaborn releases and is not counted per category.
sns.countplot(x="Stay_In_Current_City_Years", data=train)

In [None]:
# Occupation vs. Purchase: mean purchase per occupation, drawn as blue bars.
Occupation_pivot = train.pivot_table(
    values="Purchase",
    index='Occupation',
    aggfunc=np.mean,
)
Occupation_pivot.plot.bar(color='blue', figsize=(10, 5))
plt.ylabel("Purchase")
plt.xlabel("Occupation")
plt.title("Occupation and Purchase Analysis")

In [None]:
# Marital_Status vs. Purchase: mean purchase per marital status, as a bar chart.
# (The variable keeps its original spelling so any later reference still resolves.)
martial_pivot = train.pivot_table(index='Marital_Status', values="Purchase", aggfunc=np.mean)
martial_pivot.plot(kind='bar', color='blue', figsize=(10, 5))
# Label and title spelling corrected ("Maretial" -> "Marital").
plt.xlabel("Marital Status")
plt.ylabel("Purchase")
plt.title("Marital Status and Purchase Analysis")

In [None]:
# Product_Category_1 vs. Purchase: mean purchase per category, drawn as blue bars.
Product_category_1_pivot = train.pivot_table(
    values="Purchase",
    index='Product_Category_1',
    aggfunc=np.mean,
)
Product_category_1_pivot.plot.bar(color='blue', figsize=(10, 5))
plt.ylabel("Purchase")
plt.xlabel("Product_Category_1")
plt.title("Product_Category_1 and Purchase Analysis")

In [None]:
# Gender vs. Purchase: mean purchase per gender value, drawn as blue bars.
gender_pivot = train.pivot_table(
    values="Purchase",
    index='Gender',
    aggfunc=np.mean,
)
gender_pivot.plot.bar(color='blue', figsize=(10, 5))
plt.ylabel("Purchase")
plt.xlabel("Gender")
plt.title("Gender and Purchase Analysis")

In [None]:
# Age vs. Purchase: mean purchase per age bracket, as a bar chart.
age_pivot = train.pivot_table(index='Age', values="Purchase", aggfunc=np.mean)
age_pivot.plot(kind='bar', color='blue', figsize=(10, 5))
# Axis label and title corrected: they said "Gender" (copied from the
# gender plot) although the pivot is indexed by Age.
plt.xlabel("Age")
plt.ylabel("Purchase")
plt.title("Age and Purchase Analysis")

In [None]:
# City_Category vs. Purchase: mean purchase per city category, as a bar chart.
City_Category_pivot = train.pivot_table(index='City_Category', values="Purchase", aggfunc=np.mean)
City_Category_pivot.plot(kind='bar', color='blue', figsize=(10, 5))
plt.xlabel("City_Category")
plt.ylabel("Purchase")
# Title corrected: it said "Gender" although the pivot is indexed by City_Category.
plt.title("City_Category and Purchase Analysis")

In [None]:
# Tag each frame with its origin, then stack both into a single frame so the
# preprocessing below is applied to train and test rows alike.
train['source'] = 'train'
test['source'] = 'test'
data = pd.concat(
    [train, test],
    sort=False,
    ignore_index=True,  # fresh 0..n-1 index for the stacked frame
)
print(train.shape, test.shape, data.shape)

In [None]:
# Fill the missing values of the two product-category columns with that
# column's median (computed over the combined train + test rows).
for pc_col in ('Product_Category_2', 'Product_Category_3'):
    data[pc_col] = data[pc_col].fillna(data[pc_col].median())

In [None]:
# Percentage of missing values in each column of the combined frame.
data.isnull().sum().div(data.shape[0]).mul(100)

In [None]:
# Object-dtype (string) columns of the combined frame, previewed.
# The built-in "object" selector is used because the `np.object` alias was
# removed from NumPy (1.24) and raises AttributeError there.
cat_features = data.select_dtypes(include=["object"])
cat_features.head()

In [None]:
# Encode Gender as a binary flag: "M" -> 0, "F" -> 1.
data['Gender'] = data['Gender'].replace({"M": 0, "F": 1})

In [None]:
# Encode City_Category as integers: "A" -> 0, "B" -> 1, "C" -> 2.
data['City_Category'] = data['City_Category'].replace({"A": 0, "B": 1, "C": 2})

In [None]:
# Encode Stay_In_Current_City_Years as integers; the open-ended "4+" becomes 4.
data['Stay_In_Current_City_Years'] = data['Stay_In_Current_City_Years'].replace(
    {"0": 0, "1": 1, "2": 2, "3": 3, "4+": 4}
)

In [None]:
# Count of rows per distinct Age value in the combined frame.
data['Age'].value_counts()

In [None]:
# Encode the Age brackets as integers in ascending order of age
# ("0-17" -> 0 ... "55+" -> 6).
data['Age'] = data['Age'].replace(
    {
        "0-17": 0,
        "18-25": 1,
        "26-35": 2,
        "36-45": 3,
        "46-50": 4,
        "51-55": 5,
        "55+": 6,
    }
)

In [None]:
# Remove the two identifier columns from the combined frame.
data = data.drop(columns=['User_ID', 'Product_ID'])

In [None]:
# Divide the combined frame back into train and test using the `source` tag,
# dropping the helper column in the same step.
# `drop` returns a new frame here instead of mutating a slice of `data` with
# `inplace=True`, which triggers pandas' SettingWithCopyWarning.
train = data.loc[data['source'] == 'train'].drop(columns=['source'])
test = data.loc[data['source'] == 'test'].drop(columns=['source'])

In [None]:
# Work on independent copies so the frames split above stay untouched.
train_df, test_df = train.copy(), test.copy()

In [None]:
# The test copy must not carry the target column.
test_df = test_df.drop(columns=['Purchase'])

In [None]:
# Print the column dtypes and non-null counts of the preprocessed training copy.
train_df.info()

In [None]:
# Print the column dtypes and non-null counts of the preprocessed test copy.
test_df.info()

In [None]:
# Separate the target column from the training features.
X = train_df.drop(columns=['Purchase'])
y = train_df['Purchase']

# The test frame is used as-is (same object, not a copy).
x_test = test_df

# Report the shapes of the newly formed data sets.
print("Shape of the x :", X.shape)
print("Shape of the y :", y.shape)
print("Shape of the test data :", x_test.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for validation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Report the shape of every split plus the unlabelled test data.
shape_report = (
    ("Shape of the X Train :", X_train),
    ("Shape of the y Train :", y_train),
    ("Shape of the X test :", X_test),
    ("Shape of the y test :", y_test),
    ("Shape of the test data :", x_test),
)
for message, part in shape_report:
    print(message, part.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to the training split, the validation split and the
# unlabelled test data.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
x_test = sc.transform(x_test)

In [None]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from math import sqrt

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares fitted on the scaled training split.
lr = LinearRegression(n_jobs=10)
lr.fit(X_train, y_train)

# Predictions of the linear model on the validation split.
y_pred_lr = lr.predict(X_test)
# Backward-compatible alias: the original cell stored these linear-regression
# predictions under a name carrying the random-forest suffix `_rfr`.
y_test_pred_rfr = y_pred_lr

print("RMSE : ", np.sqrt(mean_squared_error(y_test, y_pred_lr)))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with shallow trees and large leaves.
# `random_state` is fixed (same seed as the train/validation split) so the
# fitted forest and the reported scores are reproducible between runs.
rfr = RandomForestRegressor(max_depth=8, min_samples_leaf=150, random_state=42)
rfr.fit(X_train, y_train)

y_pred_rfr = rfr.predict(X_test)

# `score` on a regressor returns the coefficient of determination (R^2),
# not an accuracy, so the output is labelled accordingly.
print("Training R2 score :", rfr.score(X_train, y_train))
print("Testing R2 score :", rfr.score(X_test, y_test))

rms_rf = sqrt(mean_squared_error(y_test, y_pred_rfr))
print("The Rmse value For RandomForest is ", rms_rf)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree limited in depth and leaf size.
# `random_state` is fixed (same seed as the train/validation split) so that
# ties between equally good splits are resolved the same way on every run.
dt = DecisionTreeRegressor(max_depth=15, min_samples_leaf=100, random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

# `score` on a regressor returns the coefficient of determination (R^2),
# not an accuracy, so the output is labelled accordingly.
print("Training R2 score :", dt.score(X_train, y_train))
print("Testing R2 score :", dt.score(X_test, y_test))

rms_dt = sqrt(mean_squared_error(y_test, y_pred_dt))
print("The Rmse value For Decision Tree is ", rms_dt)