In [None]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from rapidfuzz import process,fuzz
import unidecode
import matplotlib
matplotlib.use("TkAgg")   # or "Qt5Agg" if you have PyQt installed
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [None]:
# Load the sales data from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='Data.csv')

In [None]:
# Cleaning process

# 1) Check for missing values: absolute count per column, then the
#    same figure expressed as a percentage of all rows.
missing_counts = data.isnull().sum()
print(missing_counts)
print((missing_counts / len(data)) * 100)

# Finding recorded by the author: the data has no missing values.


In [None]:
# 2) Check for duplicated rows.
duplicate_mask = data.duplicated()  # True for every row that repeats an earlier row

# Only the last expression of a cell is rendered, so the summary values
# are printed explicitly instead of being left as bare expressions.
print(f"Any duplicated rows: {duplicate_mask.any()}")
print(f"Number of duplicated rows: {duplicate_mask.sum()}")
data[duplicate_mask]  # the duplicated rows themselves (rendered as the cell output)

# Finding recorded by the author: the data has no duplicated rows.

In [75]:
# 3) Handle inconsistent data entries.
# Start by confirming that every column carries the expected data type.

column_dtypes = data.dtypes
print(column_dtypes)
# 'Date' is loaded as object and has to be converted to datetime64.

Transaction ID               int64
Date                datetime64[ns]
Product Category            object
Product Name                object
Units Sold                   int64
Unit Price                 float64
Total Revenue              float64
Region                      object
Payment Method              object
dtype: object


In [None]:
# Trim leading/trailing whitespace from the text columns.
object_col = ['Product Name', 'Date', 'Product Category', 'Region', 'Payment Method']
for col in object_col:
    # Skip columns that no longer hold strings (e.g. 'Date' once it has been
    # converted to datetime64 and this cell is run again) -- .str would raise on them.
    if not (pd.api.types.is_object_dtype(data[col]) or pd.api.types.is_string_dtype(data[col])):
        continue
    # .str.strip() returns a new Series; it must be assigned back to have any effect.
    data[col] = data[col].str.strip()

# NOTE(review): this overwrites the same 'Data.csv' that the notebook reads as input.
data.to_csv('Data.csv', index=False)

In [None]:
# Parse 'Date' as day-first dates. errors='coerce' turns unparseable entries
# into NaT, so count how many were lost instead of letting them vanish silently.
missing_dates_before = data['Date'].isna().sum()
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True, errors='coerce')
unparsed_dates = data['Date'].isna().sum() - missing_dates_before
if unparsed_dates > 0:
    print(f"Warning: {unparsed_dates} 'Date' value(s) could not be parsed and were set to NaT")

In [None]:
# Normalize categorical values

# Frequency of each distinct value in 'Date'.
data.loc[:, 'Date'].value_counts()

In [None]:
# Frequency of each distinct value in 'Product Category'.
data.loc[:, 'Product Category'].value_counts()

In [None]:
# Look for near-duplicate product names with fuzzy matching, then merge the
# variants that were identified into one canonical spelling.
choices = data['Product Name'].unique()

similar_pairs = []    # (name_a, name_b, score) for every similar pair, reported once
seen_pairs = set()    # alphabetically sorted (name_a, name_b) keys already recorded
for product in choices:
    matches = process.extract(product, choices, limit=None, scorer=fuzz.partial_ratio)
    for match, score, _ in matches:
        if score < 75 or product == match:
            continue
        # Sort the two names so (a, b) and (b, a) map to the same key. The key is
        # checked against a set of 2-tuples; comparing it against the 3-tuples
        # stored in similar_pairs would never match and would let duplicates through.
        pair = tuple(sorted([product, match]))
        if pair not in seen_pairs:
            seen_pairs.add(pair)
            similar_pairs.append((pair[0], pair[1], score))

for p1, p2, score in similar_pairs:
    print(f"'{p1}' and '{p2}' have a similarity score of {score}")

# Map each variant spelling to its canonical product name. The result is assigned
# back to the column rather than using replace(..., inplace=True) on a column
# selection, which is not guaranteed to modify `data` itself.
canonical_names = {
    'MacBook Pro 16-inch': 'Apple MacBook Pro 16-inch',
    'Nike Air Force 1': 'Nike Air Force 1 Sneakers',
    'Adidas Ultraboost Shoes': 'Adidas Ultraboost Running Shoes',
    'Yeti Rambler Tumbler': 'Yeti Rambler 20 oz Tumbler',
}
data['Product Name'] = data['Product Name'].replace(canonical_names)

# NOTE(review): this overwrites the same 'Data.csv' that the notebook reads as input.
data.to_csv('Data.csv', index=False)

In [None]:
# Frequency of each distinct value in 'Region'.
data.loc[:, 'Region'].value_counts()

In [None]:
# Frequency of each distinct value in 'Payment Method'.
data.loc[:, 'Payment Method'].value_counts()

In [None]:
# Check for non-ASCII characters in the 'Product Name' column.
import unicodedata


# Boolean Series: True where the product name contains at least one non-ASCII character.
mask = data['Product Name'].apply(lambda x: not str(x).isascii())

# This is not the last expression of the cell, so a bare `data[mask]` would be
# evaluated and thrown away; print the affected rows instead.
print(data[mask])

# Function to normalize text by removing accents and other non-ASCII characters.
def normalize_text(text):
    """Return `text` reduced to plain ASCII.

    The string is decomposed with Unicode NFKD normalization, so an accented
    letter becomes its base letter followed by a combining mark; every
    character that is not ASCII is then dropped.

    Parameters
    ----------
    text : str or any
        The value to normalize. Non-string values (for example NaN or None
        in a missing cell) are returned unchanged instead of raising.

    Returns
    -------
    str or any
        The ASCII-only string, or the original value when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
# Normalize every product name, then persist the result.
normalized_names = data['Product Name'].map(normalize_text)
data['Product Name'] = normalized_names
data.to_csv(path_or_buf='Data.csv', index=False)


In [None]:
# Check the numeric columns for invalid entries such as negative values.
invalid_unitSolds = data.loc[data['Units Sold'].lt(0)]
invalid_unitPrice = data.loc[data['Unit Price'].lt(0)]
invalid_totalRevenue = data.loc[data['Total Revenue'].lt(0)]

# Show the offending rows for each column in turn.
for invalid_rows in (invalid_unitSolds, invalid_unitPrice, invalid_totalRevenue):
    print(invalid_rows)

In [None]:
# 4) Handling outliers

numeric_cols = data[['Units Sold', 'Unit Price', 'Total Revenue']]

# Flag outliers per column with the 1.5 * IQR rule and report how many were found.
for col in numeric_cols:  # iterating a DataFrame yields its column names
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    # The combined boolean mask selects the rows whose value in this column
    # falls outside [lower, upper]; `outliers` is the DataFrame of those rows.
    outliers = data[(data[col] < lower) | (data[col] > upper)]
    # Report the result so the computation is not silently discarded.
    print(f"Column '{col}': {len(outliers)} outlier(s) outside [{lower:.2f}, {upper:.2f}]")
    if not outliers.empty:
        print(outliers[[col]])


# Use box plots to visualize potential outliers, one figure per numeric column.
fig1, ax1 = plt.subplots()
fig2, ax2 = plt.subplots()
fig3, ax3 = plt.subplots()

# (figure, axes, column to plot, x-axis label)
boxplot_specs = [
    (fig1, ax1, 'Units Sold', 'No. of Units Sold'),
    (fig2, ax2, 'Unit Price', 'USD$'),
    (fig3, ax3, 'Total Revenue', 'USD$'),
]
for fig, ax, column, xlabel in boxplot_specs:
    # NOTE(review): newer Matplotlib releases rename `labels` to `tick_labels`;
    # switch once the minimum supported Matplotlib version is known.
    ax.boxplot(data[column], vert=False, labels=[column])
    ax.set_title(f'Box plot for {column}')
    ax.set_xlabel(xlabel)
    # plt.tight_layout() only adjusts the current (last created) figure,
    # so each figure is tightened individually.
    fig.tight_layout()

plt.show()

# Author's reading of the plots: there are no outliers in the dataset.

In [None]:
# Transform the monetary columns to log scale (log1p) to reduce the effect of outliers.
# NOTE: the columns are overwritten in place, so running this cell a second
# time applies the logarithm again.
for money_col in ('Unit Price', 'Total Revenue'):
    data[money_col] = np.log1p(data[money_col])

In [None]:

# Visualize the data after the log transformation.
fig2, ax2 = plt.subplots()
fig3, ax3 = plt.subplots()

# 'Unit Price' and 'Total Revenue' now hold log1p-transformed values, so the
# titles and axis labels say so instead of claiming plain USD.
for fig, ax, column in ((fig2, ax2, 'Unit Price'), (fig3, ax3, 'Total Revenue')):
    ax.boxplot(data[column], vert=False, labels=[column])
    ax.set_title(f'Box plot for {column} (log scale)')
    ax.set_xlabel('log(1 + USD$)')
    # plt.tight_layout() only adjusts the current (last created) figure,
    # so each figure is tightened individually.
    fig.tight_layout()

plt.show()

In [80]:
# One-hot encode the categorical columns.
categorical_cols = ['Region', 'Product Category', 'Payment Method']
data_encoded = pd.get_dummies(data, columns=categorical_cols)

# Convert any boolean columns (the indicator columns, when get_dummies
# returns them as bool) to integers.
bool_cols = data_encoded.select_dtypes(include='bool').columns
for indicator_col in bool_cols:
    data_encoded[indicator_col] = data_encoded[indicator_col].astype(int)

print(data_encoded.dtypes)

Transaction ID                               int64
Date                                datetime64[ns]
Product Name                                object
Units Sold                                   int64
Unit Price                                 float64
Total Revenue                              float64
Region_Asia                                  int64
Region_Europe                                int64
Region_North America                         int64
Product Category_Beauty Products             int64
Product Category_Books                       int64
Product Category_Clothing                    int64
Product Category_Electronics                 int64
Product Category_Home Appliances             int64
Product Category_Sports                      int64
Payment Method_Credit Card                   int64
Payment Method_Debit Card                    int64
Payment Method_PayPal                        int64
dtype: object


In [81]:
# Remove the free-text product name and the transaction identifier columns.
columns_to_drop = ['Product Name', 'Transaction ID']
data_encoded = data_encoded.drop(columns=columns_to_drop)
print(data_encoded.head())

        Date  Units Sold  Unit Price  Total Revenue  Region_Asia  \
0 2024-01-01           2    6.908745       7.601392            0   
1 2024-01-02           1    6.216586       6.216586            0   
2 2024-01-03           3    4.262539       5.351716            1   
3 2024-01-04           4    2.832625       4.173772            0   
4 2024-01-05           1    4.510750       4.510750            0   

   Region_Europe  Region_North America  Product Category_Beauty Products  \
0              0                     1                                 0   
1              1                     0                                 0   
2              0                     0                                 0   
3              0                     1                                 0   
4              1                     0                                 1   

   Product Category_Books  Product Category_Clothing  \
0                       0                          0   
1                     

In [83]:
# Feature matrix: every numeric column of the encoded data.
# A 'Cluster' column left over from a previous run of this cell is dropped
# first; otherwise the old labels would be fed back in as a feature.
X = data_encoded.drop(columns='Cluster', errors='ignore').select_dtypes(include='number')
print(X.head())

# Cluster into 3 groups; random_state fixes the seed passed to KMeans.
kmeans = KMeans(n_clusters=3, random_state=123)
data_encoded['Cluster'] = kmeans.fit_predict(X)

# Project the features onto 2 principal components so the clusters can be plotted.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Plot clusters
plt.figure(figsize=(8,6))
plt.scatter(X_pca[:,0], X_pca[:,1], c=data_encoded['Cluster'], cmap='viridis', s=50)
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('K-Means Clusters (2D PCA)')
plt.colorbar(label='Cluster')
plt.show()

# NOTE(review): the recorded run failed here with "PermissionError: [Errno 13]".
# Presumably 'Cleaned_Data.csv' was open in another program at the time -- confirm
# and close it before re-running.
data_encoded.to_csv('Cleaned_Data.csv', index=False)

   Units Sold  Unit Price  Total Revenue  Region_Asia  Region_Europe  \
0           2    6.908745       7.601392            0              0   
1           1    6.216586       6.216586            0              1   
2           3    4.262539       5.351716            1              0   
3           4    2.832625       4.173772            0              0   
4           1    4.510750       4.510750            0              1   

   Region_North America  Product Category_Beauty Products  \
0                     1                                 0   
1                     0                                 0   
2                     0                                 0   
3                     1                                 0   
4                     0                                 1   

   Product Category_Books  Product Category_Clothing  \
0                       0                          0   
1                       0                          0   
2                       0       



PermissionError: [Errno 13] Permission denied: 'Cleaned_Data.csv'