In [None]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Customer-data preprocessing: load the CSV, impute missing values, label-encode
# the categorical columns, bin ages into groups and min-max scale numeric columns.
df = pd.read_csv('CustomerData.csv')
df.head()

#Handle missing values
df.isnull().sum()
# Assign the filled frame back to `df` rather than calling
# `df.<col>.fillna(..., inplace=True)`: an in-place fillna on a column pulled
# out of the frame is chained assignment, which pandas 2.x warns about and
# which leaves `df` untouched once Copy-on-Write is enabled.
fill_values = {
    'gender': df['gender'].mode()[0],        # most frequent value
    'category': df['category'].mode()[0],    # most frequent value
    'age': int(df['age'].mean()),            # mean age, truncated to an integer
    'annual income (lakhs)': df['annual income (lakhs)'].mean(),
}
df = df.fillna(value=fill_values)
df.isnull().sum()

#Encode categorical values to numerical values
# NOTE: the single encoder is re-fitted for every column, so after this line it
# only remembers the classes of the last column it saw ('gender').
# The trailing space in 'purchase type ' is part of the column key used here.
encoder = preprocessing.LabelEncoder()
categorical_cols = ['category', 'purchase type ', 'gender']
df[categorical_cols] = df[categorical_cols].apply(encoder.fit_transform)
df.head()

#Binning ages into child, teenager, adult, middle aged, old aged
# Bin edges are right-inclusive; include_lowest=True also admits age == 0.
age_bins = [0, 12, 19, 30, 50, 100]
age_labels = ["child", "teenager", "adult", "middle aged", "old aged"]
df['age group'] = pd.cut(x = df['age'], bins = age_bins, labels = age_labels, include_lowest = True)
df.head()

#Applying MinMax scaler on spending scores (reduces values between 0-1)
scaler = MinMaxScaler()
scaled_cols = ['spending score', 'items purchased (monthly)', 'annual income (lakhs)']
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])
df.head()


In [None]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Exploratory plots of customer age before and after removing under-17 rows.
df = pd.read_csv('CustomerData.csv')

# Every chart gets its own figure/axes. seaborn's axes-level functions draw on
# the current matplotlib axes, so without this all plots issued from one cell
# are painted on top of each other.
fig, ax = plt.subplots()
sns.histplot(data = df["age"], ax = ax)
ax.set_title("Age distribution (raw)")

fig, ax = plt.subplots()
sns.boxplot(x = df['age'], ax = ax)
ax.set_title("Age box plot (raw)")

#Removing outliers in age as person with age < 17 does not have a stable earning
# Rows whose age is missing are kept: NaN < 17 evaluates to False.
df = df.drop(index = df.index[df['age'] < 17])

fig, ax = plt.subplots()
sns.histplot(data = df["age"], ax = ax)
ax.set_title("Age distribution (age >= 17)")

fig, ax = plt.subplots()
sns.scatterplot(data = df, x = 'age', y = 'annual income (lakhs)', ax = ax)
ax.set_title("Annual income vs. age")

In [None]:
# Association-rule mining with Apriori (apyori) on the transaction table.
df1 = pd.read_csv('association_analysis.csv')
df1.head()

# The transaction id is not an item, so drop it before building baskets.
df1 = df1.drop(columns = ['tid'])
# !pip install apyori
from apyori import apriori
#Converting dataframe to a list of lists containing items

# Materialise the value matrix once; evaluating `df1.values` inside the nested
# loop rebuilt/re-fetched it for every single (row, column) pair.
item_names = list(df1.columns)
basket_matrix = df1.to_numpy()
# A column name is added to a record whenever the cell value is truthy.
# NOTE(review): NaN is truthy, so missing cells would count as "present" —
# confirm the CSV has no missing values.
records = [
    [item for item, present in zip(item_names, row) if present]
    for row in basket_matrix
]

records[:3]

min_sup = 0.03          # minimum relative support
min_confidence = 0.7    # minimum rule confidence

#Apriori
rules = list(apriori(records, min_support = min_sup, min_confidence = min_confidence))
rules[0]

for rule in rules:
    # rule[0] is the item set, rule[1] its support, rule[2] the list of ordered
    # statistics; only the confidence of the first ordered statistic is printed.
    items = list(rule[0])
    print("Rule : ",items, "Support :", rule[1], "Confidence : ", rule[2][0][2])


In [None]:
#FP Growth

#!pip install pyfpgrowth
import pyfpgrowth

import pandas as pd
# Association-rule mining with FP-Growth (pyfpgrowth) on the transaction table.
import math

df1 = pd.read_csv('association_analysis.csv')
df1.head()

# Drop the transaction id from df1, the frame this cell works on. Dropping it
# from `df` (the customer table of another cell) either fails or leaves 'tid'
# in df1, where it would be treated as an item.
df1 = df1.drop(columns = ['tid'])

# Materialise the value matrix once instead of evaluating `df1.values` for
# every (row, column) pair; a column name joins the record when the cell is truthy.
item_names = list(df1.columns)
basket_matrix = df1.to_numpy()
records = [
    [item for item, present in zip(item_names, row) if present]
    for row in basket_matrix
]

records[:3]

# pyfpgrowth expects the support threshold as an absolute number of
# transactions, not a fraction. Passing 0.03 directly keeps every pattern that
# occurs at least once, so convert the relative support into a count first.
min_sup = 0.03
min_confidence = 0.7
min_support_count = math.ceil(min_sup * len(records))

itemsets = pyfpgrowth.find_frequent_patterns(records, min_support_count)
itemsets

pyfpgrowth.generate_association_rules(itemsets, min_confidence)


In [None]:
# Decision Tree
import pandas as pd
from sklearn import datasets
import numpy as np

# Train and evaluate a decision tree on the iris dataset (60/40 split).
iris = datasets.load_iris()  # load the bunch once and reuse it for data and target
irisdata = pd.DataFrame(iris.data, columns = iris.feature_names)
irisdata

iristarget = pd.DataFrame(iris.target)
iristarget

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(irisdata, iristarget, test_size=0.4,random_state=42)

from sklearn import tree
# Fix the seed: the tree permutes features at each split, so without a
# random_state the fitted tree (and the accuracy) can change between runs.
model = tree.DecisionTreeClassifier(random_state=42)

iris_model = model.fit(X_train, y_train)

pred = iris_model.predict(X_test)

from sklearn.metrics import accuracy_score
# Score the predictions computed above instead of predicting a second time.
accuracy_score(y_test, pred)

tree.plot_tree(iris_model)

In [None]:
# Random Forest
import pandas as pd
from sklearn import datasets

# Train and evaluate a random forest on the iris dataset (60/40 split).
iris = datasets.load_iris()  # load the bunch once and reuse it for data and target
irisdata = pd.DataFrame(iris.data, columns = iris.feature_names)

iristarget = pd.DataFrame(iris.target)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(irisdata, iristarget, test_size=0.4, random_state=42)

from sklearn.ensemble import RandomForestClassifier
# Seed the forest so bootstrap samples and feature sub-sampling are repeatable.
model = RandomForestClassifier(random_state=42)

# y_train is a single-column DataFrame; flatten it to 1-D because the forest
# emits a DataConversionWarning when given a column vector as the target.
clf = model.fit(X_train, y_train.to_numpy().ravel())
pred = clf.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test,pred)

In [None]:
#KMeans
import pandas as pd
from sklearn import datasets

# Fit KMeans (k=3) on the iris features and plot petal length vs. petal width.
iris = datasets.load_iris()  # load the bunch once and reuse it for data and target
x = pd.DataFrame(iris.data, columns = iris.feature_names)
y = pd.DataFrame(iris.target, columns = ['Targets'])

from sklearn.cluster import KMeans

model = KMeans(n_clusters=3, random_state=0)
# Fit on `x`, the feature frame defined in this cell. The name `X` is not
# defined here, so `model.fit(X)` raises NameError on a fresh kernel (or
# silently fits on a stale variable left over from another cell).
model.fit(x)

import numpy as np
color_map = np.array(['red','green','blue'])
import matplotlib.pyplot as plt
# Points are coloured by the true species labels in `y`, not by the fitted
# cluster assignments (`model.labels_`).
plt.scatter(x['petal length (cm)'], x['petal width (cm)'], c = color_map[y.Targets], s=40)

In [None]:
#dbscan
import pandas as pd
from sklearn import datasets

# Fit DBSCAN on the iris features, then scatter petal length against petal
# width with each point coloured by its true species label.
from sklearn.cluster import DBSCAN
import numpy as np
import matplotlib.pyplot as plt

# Feature table (one column per measurement) and a one-column target table.
x = pd.DataFrame(
    data=datasets.load_iris().data,
    columns=datasets.load_iris().feature_names,
)
y = pd.DataFrame(
    data=datasets.load_iris().target,
    columns=['Targets'],
)

# Density-based clustering with a neighbourhood radius of 0.1 and at least
# 3 samples per core point; fit() hands back the estimator itself.
Model = DBSCAN(eps=0.1, min_samples=3).fit(x)

# One colour per target class, looked up through the integer labels.
color_map = np.array(['red', 'green', 'blue'])
point_colors = color_map[y['Targets']]
plt.scatter(
    x['petal length (cm)'],
    x['petal width (cm)'],
    c=point_colors,
    s=40,
)