In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sma
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw data. Every later cell works on `df`, so the frame is bound to
# that name; `dataset` stays available as a second name for the same frame.
df = pd.read_csv("MCD.csv")
dataset = df

In [None]:
# Preview the first rows of the loaded frame.
dataset.head()

In [None]:
# Size of the frame as a (rows, columns) tuple.
df.shape

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Give the income and spending-score columns short names without spaces or symbols.
column_names = {
    'Annual Income (k$)': 'Annual_income',
    'Spending Score (1-100)': 'Spending_score',
}
df = df.rename(columns=column_names)

In [None]:
# Recode Gender as numbers: Female -> 0, Male -> 1.
# The result is assigned back instead of calling replace(..., inplace=True) on
# df['Gender']: the in-place form acts on an intermediate Series and leaves df
# unchanged when pandas Copy-on-Write is active. Re-running the cell is
# harmless because values that are already 0/1 are left as they are.
df['Gender'] = df['Gender'].replace({'Female': 0, 'Male': 1})

# Univariate, Bivariate and Multivariate Analysis

In [None]:
# Rows for female customers. Gender is recoded to 0/1 earlier in the notebook
# (Female -> 0), so comparing against the string 'Female' alone selects nothing;
# both spellings are accepted so the subset is right before and after recoding.
df_Gender = df.loc[df['Gender'].isin(['Female', 0])]


In [None]:
# One-dimensional view of the ages in df_Gender: every customer is a point on
# the line y = 0. The 'o' format draws separate markers rather than a single
# connected line, and the axis label names the column that is actually plotted.
plt.plot(df_Gender['Age'], np.zeros_like(df_Gender['Age']), 'o')
plt.xlabel('Age')
plt.show()

In [None]:
# Display the Gender column to confirm the recoding took effect.
df['Gender']

In [None]:
# Distribution of each numeric feature: histogram with a KDE overlay, one panel per feature.
feature_list = ['Age', 'Annual_income', 'Spending_score']
# NOTE(review): misspelled duplicate of feature_list, unused in the visible cells;
# kept only in case a later cell reads it.
feature_listt = list(feature_list)

fig, axes = plt.subplots(1, len(feature_list), figsize=(15, 6))
fig.subplots_adjust(hspace=0.5, wspace=0.5)  # set once, not on every iteration
for ax, feature in zip(axes, feature_list):
    # histplot replaces the deprecated distplot; stat="density" keeps the
    # density-scaled y-axis that distplot produced.
    sns.histplot(df[feature], bins=20, kde=True, stat="density", ax=ax)
    ax.set_title(feature)
plt.show()

In [None]:
# Horizontal bar chart of customers per gender, then the exact counts as the cell output.
sns.countplot(
    data=df,
    y='Gender',
    hue="Gender",
    palette="husl",
)
df["Gender"].value_counts()

In [None]:
# Pairwise regression plots of the three numeric features, coloured by gender.
pair_columns = ["Age", "Annual_income", "Spending_score"]
sns.pairplot(
    df,
    vars=pair_columns,
    kind="reg",
    hue="Gender",
    palette="husl",
    markers=['o', 'D'],
)

# Statistical Analysis

In [None]:
# First rows of the frame as it stands at this point in the notebook.
df.head()

In [None]:
# Summary statistics of the frame's columns at this stage.
df.describe()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# How often each distinct Annual_income value occurs, shown as a one-column table.
df['Annual_income'].value_counts().to_frame()

# Duplicates and Missing Values

In [None]:
# Rows that repeat an earlier row exactly (an empty result means no duplicates).
is_duplicate = df.duplicated()
df.loc[is_duplicate]

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# Replace the outliers

In [None]:
# Cap outliers in every int64/float64 column with the 1.5 * IQR rule:
# values above the upper fence become the fence, likewise below the lower fence.
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    df[col] = np.where(
        df[col] > upper,
        upper,
        np.where(df[col] < lower, lower, df[col]),
    )

In [None]:
import matplotlib.pyplot as mtp

In [None]:
def box_scatter(df, x, y):
    """Draw a box plot of ``x`` above a scatter plot of ``y`` against ``x``.

    Parameters
    ----------
    df : DataFrame
        Frame that holds the columns to plot.
    x : str
        Column shown in the box plot and on the scatter plot's x-axis.
    y : str
        Column shown on the scatter plot's y-axis.

    Creates a new 16x6 inch figure with two stacked axes and returns nothing.
    """
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(16, 6))
    # seaborn receives the frame through ``data``; it has no ``df`` parameter.
    sns.boxplot(data=df, x=x, ax=ax1)
    sns.scatterplot(data=df, x=x, y=y, ax=ax2)
# One box plot per int64/float64 column to check the result of the outlier capping.
for i in df:
    if df[i].dtype == 'int64' or df[i].dtype == 'float64':
        # Missing values are dropped because matplotlib draws an empty box when
        # the data contains NaN; the title tells the otherwise identical plots apart.
        plt.boxplot(df[i].dropna())
        plt.title(i)
        plt.show()

# Encoding

In [None]:
# Look at the frame before the encoding step.
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the categorical column. Age is numeric and must stay as it is:
# label-encoding it would replace each age by its rank among the distinct ages
# and distort the distances the clustering below relies on.
# LabelEncoder numbers the sorted classes, so 'Female'/'Male' become 0/1 and a
# column that already holds 0/1 is left unchanged.
encoder = LabelEncoder()
df['Gender'] = encoder.fit_transform(df['Gender'])
df.head()

In [None]:
# Feature matrix: every column of df except the last one.
# NOTE(review): presumably the last column is Spending_score — confirm against the CSV layout.
x=df.iloc[:,:-1]
x.head()

# Independent Variable Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each feature column to zero mean and unit variance.
# The result is a NumPy array that replaces the frame previously held in x.
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

# Choosing the number of clusters (elbow method)

In [None]:
# Inertia of KMeans for 1 to 19 clusters, used to draw the elbow curve.
from sklearn.cluster import KMeans

X = df[["Age", "Annual_income", "Spending_score"]]
k = range(1, 20)
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(X).inertia_
    for n_clusters in k
]

In [None]:
# Elbow curve: inertia against the number of clusters.
plt.plot(k, inertia, 'bo-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Clustering

In [None]:
# Fit the final 5-cluster KMeans model; keep each row's cluster label and the cluster centres.
means_k = KMeans(n_clusters=5, random_state=0).fit(X)
labels, centroids = means_k.labels_, means_k.cluster_centers_

In [None]:
pip install plotly

In [None]:
# Create a 3D plot to view the data separation made by KMeans
import plotly.express as px
import plotly.graph_objs as go

# One marker per customer, coloured by its KMeans cluster label.
trace1 = go.Scatter3d(
    x=X['Spending_score'],
    y=X['Annual_income'],
    z=X['Age'],
    mode='markers',
    marker=dict(
        color=labels,
        size=10,
        line=dict(
            color=labels,
        ),
        opacity=0.9,
    ),
)
layout = go.Layout(
    title='Clusters',
    scene=dict(
        xaxis=dict(title='Spending_score'),
        yaxis=dict(title='Annual_income'),
        zaxis=dict(title='Age'),
    ),
)
fig = go.Figure(data=[trace1], layout=layout)
# Figure.show() renders the figure in the notebook; the notebook never imports
# a `py` module, so the py.offline.iplot call cannot be used here.
fig.show()

# Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# NOTE(review): from this cell on the notebook works with scikit-learn's bundled
# digits data, not the customer frame; `x` is rebound to the digits features.
digits = datasets.load_digits()

x = digits.data
y = digits.target

# Report the shapes of the arrays just loaded. These are the lowercase x / y:
# uppercase X is the clustering frame and no Y exists in this notebook.
for part in (x, y):
    print()
    print(part.shape)

# Hold out 33% of the samples for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

for part in (x_train, x_test, y_train, y_test):
    print()
    print(part.shape)

# Build the model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the fitted model and its reported error can be reproduced;
# 42 matches the seed already used for the train/test split.
# NOTE(review): the target comes from digits.target; confirm that a regressor
# rather than a classifier is intended for it.
reg = RandomForestRegressor(random_state=42)

# Train the model

In [None]:
# Fit the model on the training split.
reg.fit(x_train,y_train)

# Test the model

In [None]:
# Predict the target for the held-out test split.
y_pred=reg.predict(x_test)

# Performance measurement

In [None]:
import math
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the predictions on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
print(rmse)