Classification Model

In [None]:
import pandas as pd

In [None]:
# Read the raw retail CSV into `df`.
# The location can be overridden with the ONLINE_RETAIL_CSV environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original absolute path is used.
import os

csv_path = os.environ.get(
    "ONLINE_RETAIL_CSV",
    r"C:\Users\John\projects\Personal_Projects\online_retail_II.csv",
)
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Report the frame's (rows, columns) and then the number of missing values
# in each column.
frame_shape = df.shape
print(frame_shape)
df.isna().sum()

In [None]:
# Remove every row that has at least one missing value (modifies `df` in
# place), then re-check the shape and per-column null counts.
df.dropna(axis=0, how="any", inplace=True)
shape_after_drop = df.shape
print(shape_after_drop)
df.isna().sum()

In [None]:
# Keep only rows with a strictly positive Quantity.
# `.copy()` makes the result an independent frame: columns are added to `df`
# further down, and assigning into a filtered slice raises pandas'
# SettingWithCopyWarning.
df = df[df["Quantity"] > 0].copy()
df.head()

In [None]:
import matplotlib.pyplot as plt

In [None]:
def make_year(dateyear):
    """Return the first four characters of the text form of ``dateyear``.

    The value is passed through ``str`` first, so the function accepts both a
    date string such as ``"2009-12-01 07:45:00"`` and a datetime-like object
    whose ``str()`` starts with ``YYYY-`` (which keeps the cell re-runnable
    after the column has been converted to datetimes).
    """
    return str(dateyear)[:4]

def make_month(dateyear):
    """Return characters 5-6 (zero-based) of the text form of ``dateyear``.

    For a value formatted as ``YYYY-MM-...`` this is the two-digit month.
    The value is passed through ``str`` first, so datetime-like objects whose
    ``str()`` has that layout are accepted as well as plain strings.
    """
    return str(dateyear)[5:7]



# Derive the calendar fields from InvoiceDate and the monetary value of each
# row (unit price times quantity).
invoice_dates = df["InvoiceDate"]
df["Year"] = invoice_dates.map(make_year)
df["Month"] = invoice_dates.map(make_month)
df["Total Sales"] = df["Price"].mul(df["Quantity"])
df.head()

In [None]:
# Bar chart of the five countries with the most rows in `df`.
country_counts = df['Country'].value_counts().head(5)
country_counts.plot.bar(figsize=(10, 5))
plt.title('Distribution of Orders by Country')
plt.xlabel('Country')
plt.ylabel('Frequency')
plt.xticks(rotation=45)

In [None]:
# Bar chart of the ten most frequent values of the Description column.
# The axis label and title name products: the original text said "Customer",
# which did not match the plotted column.
description = df["Description"].value_counts().head(10)
description.plot(kind="bar", figsize=(10,5))
plt.xlabel('Product Description')
plt.ylabel('Frequency')
plt.title('Top 10 Most Frequently Ordered Products')

In [None]:
# Bar chart of row counts per Month value, in ascending month order.
# Counting on the Series (df["Month"]) rather than a one-column frame keeps a
# plain index, so tick labels read "01", "02", ... instead of tuples.
# The Month column carries no year, so rows from different years that share a
# month are counted together.
months_date = df["Month"].value_counts().sort_index(ascending=True)
months_date.plot(kind="bar", figsize=(10,5))
plt.xlabel('Month')
plt.ylabel('Frequency')
plt.title('Distribution of Orders by Month')

In [None]:
# Monetary value: sum of "Total Sales" per customer, ordered by Customer ID,
# returned as a two-column frame.
sales_value = (
    df.groupby("Customer ID")["Total Sales"]
    .agg("sum")
    .sort_index()
    .reset_index()
)
sales_value

In [None]:
# Frequency: number of rows in `df` for each customer, ordered by Customer ID.
# groupby().size() with an explicit column name yields the columns
# "Customer ID" and "count" on every pandas version. The column names produced
# by value_counts().reset_index() changed in pandas 2.0, and the merge and
# labelling steps below rely on exactly these two names.
frequency = (
    df.groupby("Customer ID")
    .size()
    .reset_index(name="count")
)
frequency

In [None]:
# Parse InvoiceDate into datetimes, then take the day after the latest
# invoice as the reference point for recency.
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
latest_invoice = df["InvoiceDate"].max()
recent_date = latest_invoice + pd.Timedelta(days=1)

In [None]:
# Recency: whole days between the reference date and each customer's most
# recent invoice.
last_invoice_per_customer = df.groupby("Customer ID")["InvoiceDate"].max()
cur_date = last_invoice_per_customer.reset_index()

days_since_last = recent_date - cur_date["InvoiceDate"]
cur_date["last_purchase"] = days_since_last.dt.days
recency = (
    cur_date.groupby("Customer ID")["last_purchase"]
    .min()
    .sort_index()
    .reset_index()
)
recency

In [None]:
# Join recency, frequency and monetary value into one row per customer
# (inner joins on "Customer ID").
rf = recency.merge(frequency, on="Customer ID")
new_data = rf.merge(sales_value, on="Customer ID")

new_data.head()

In [None]:
def add_label(data):
    """Assign a value segment to one customer record.

    ``data`` is indexed by the keys ``"last_purchase"``, ``"count"`` and
    ``"Total Sales"``.

    Returns:
        ``"High_value"`` when last_purchase < 30, count > 10 and
        Total Sales > 1000; otherwise ``"Medium_value"`` when
        last_purchase < 60 and count > 5; otherwise ``"Low_value"``.
    """
    # Guard clauses, checked from the strictest rule to the loosest.
    if data["last_purchase"] < 30 and data["count"] > 10 and data["Total Sales"] > 1000:
        return "High_value"
    if data["last_purchase"] < 60 and data["count"] > 5:
        return "Medium_value"
    return "Low_value"

# Label every customer row with its segment.
segment_labels = new_data.apply(add_label, axis="columns")
new_data["Segment"] = segment_labels
new_data.head()

In [None]:
# Scatter each per-customer metric against Customer ID, one panel per metric.
# A single row of three axes is used: the previous 4x3 grid filled only its
# top row, which squeezed every panel into a quarter of the figure height.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, metric in zip(axes, ["Total Sales", "last_purchase", "count"]):
    ax.scatter(x=new_data["Customer ID"], y=new_data[metric])
    ax.set(xlabel="Customer ID", ylabel=metric, title=f"{metric} by Customer ID")

# Keep axis labels of neighbouring panels from overlapping.
fig.tight_layout()


In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Turn the Segment strings into integer class ids; `le` keeps the mapping.
segment_column = new_data["Segment"]
le = LabelEncoder().fit(segment_column)

y = le.transform(segment_column)

In [None]:
# Standardize the three per-customer features across all rows of new_data.
rfm_features = new_data[["last_purchase", "count", "Total Sales"]]
scale = StandardScaler().fit(rfm_features)

x = scale.transform(rfm_features)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 20% of the customers for testing (same seed as before, so the row
# partition is unchanged) and fit a 3-nearest-neighbour classifier.
# The raw features are split first and the scaler is fitted on the training
# rows only, so the mean and variance of the test rows never influence the
# standardization applied during training (avoids train/test leakage).
raw_features = new_data[["last_purchase", "count", "Total Sales"]]
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    raw_features, y, train_size=0.8, random_state=47
)

train_scaler = StandardScaler().fit(x_train_raw)
x_train = train_scaler.transform(x_train_raw)
x_test = train_scaler.transform(x_test_raw)

knc = KNeighborsClassifier(n_neighbors=3)
knc.fit(x_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score

# Score the fitted classifier on the held-out split: overall accuracy first,
# then the per-class precision/recall/F1 report.
y_pred = knc.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)
test_report = classification_report(y_test, y_pred)
print(test_report)
#print(confusion_matrix(y_test,y_pred ))

In [None]:
# Display the per-customer table (Customer ID, last_purchase, count,
# Total Sales and the Segment label).
new_data

In [None]:
# Re-fit the shared scaler on the three per-customer features and keep the
# standardized matrix as x1.
x1 = scale.fit_transform(
    new_data[["last_purchase", "count", "Total Sales"]]
)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Cluster the standardized features into three groups and report the
# silhouette score of the resulting assignment.
km = KMeans(n_clusters=3, init="k-means++", random_state=47).fit(x)
x_pred = km.predict(x)

silhouette_score(x, x_pred)

In [None]:
# Within-cluster sum of squares (inertia) for k = 1..10, collected for the
# elbow plot.
wccs = []

for i in range(1, 11):
    km = KMeans(n_clusters=i, init="k-means++", random_state=47).fit(x)
    wccs.append(km.inertia_)
wccs

In [None]:
import seaborn as sns

# Elbow plot: inertia against the number of clusters tried (1..10).
# Axis labels and a title are set so the figure can be read on its own.
elbow_ax = sns.lineplot(x=list(range(1, 11)), y=wccs, marker="o", color="red")
elbow_ax.set_xlabel("Number of clusters (k)")
elbow_ax.set_ylabel("Within-cluster sum of squares")
elbow_ax.set_title("Elbow plot for KMeans")
plt.show()

In [None]:
# Fit the three-cluster model again (the elbow loop overwrote `km`) and store
# each customer's cluster id in x_pred.
km = KMeans(n_clusters=3, init="k-means++", random_state=47).fit(x)
x_pred = km.predict(x)

In [None]:
# Pairwise scatter plots of the per-customer metrics, coloured by KMeans
# cluster id. A single row of three axes is used: the previous 4x3 grid filled
# only its top row, which squeezed every panel into a quarter of the height.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

cluster_panels = [
    ("count", "Total Sales"),
    ("count", "last_purchase"),
    ("Total Sales", "last_purchase"),
]
for ax, (x_col, y_col) in zip(axes, cluster_panels):
    ax.scatter(x=new_data[x_col], y=new_data[y_col], c=x_pred)
    ax.set(xlabel=x_col, ylabel=y_col, title=f"{y_col} vs {x_col}")

# Keep axis labels of neighbouring panels from overlapping.
fig.tight_layout()