# Importing Necessary Modules

In [13]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from mpl_toolkits.mplot3d import Axes3D
# Apply seaborn's default visual theme to the plots drawn later in the notebook.
sb.set()

# Reading CSV File

In [14]:
# Load the fashion sales records from the CSV export into a DataFrame.
fashiondata_raw = pd.read_csv(filepath_or_buffer='fashion_data_2018_2022.csv')

# Preview the first five rows as the cell output.
fashiondata_raw.head(n=5)

Unnamed: 0,product_id,product_name,gender,category,pattern,color,age_group,season,price,material,sales_count,reviews_count,average_rating,out_of_stock_times,brand,discount,last_stock_date,wish_list_count,month_of_sale,year_of_sale
0,1001,Biker Jacket,Male,Shirt,Geometric,White,25-35,Spring,70.36,Synthetic,75,65,4.9,3,ZARA,20%,2018/1/28,211,1,2018
1,1002,Business Shirt,Male,Jacket,Polka Dots,Beige,18-24,Summer,91.59,Wool,296,25,3.5,6,ZARA,0%,2018/1/28,211,1,2018
2,1003,Wool Jacket,Female,Dress,Geometric,Brown,18-24,All,129.52,Wool,50,32,4.3,4,ZARA,5%,2018/1/7,64,1,2018
3,1004,Summer Dress,Male,Shorts,Plain,White,25-35,All,116.01,Cotton,435,73,4.6,3,ZARA,10%,2018/1/1,171,1,2018
4,1005,Casual Jeans,Male,Shirt,Plain,Beige,35-45,Winter,125.48,Viscose,79,36,3.6,4,ZARA,0%,2018/1/28,126,1,2018


# Extracting Necessary Data

In [15]:
# Keep only the selected columns; .copy() yields a frame that is independent
# of fashiondata_raw, so later edits to it cannot touch the raw data.
fashiondata_extracted = fashiondata_raw[
    [
        "gender",
        "color",
        "price",
        "material",
        "year_of_sale",
        "age_group",
        "season",
        "category",
        "sales_count",
        "average_rating",
    ]
].copy()
display(fashiondata_extracted)

Unnamed: 0,gender,color,price,material,year_of_sale,age_group,season,category,sales_count,average_rating
0,Male,White,70.36,Synthetic,2018,25-35,Spring,Shirt,75,4.9
1,Male,Beige,91.59,Wool,2018,18-24,Summer,Jacket,296,3.5
2,Female,Brown,129.52,Wool,2018,18-24,All,Dress,50,4.3
3,Male,White,116.01,Cotton,2018,25-35,All,Shorts,435,4.6
4,Male,Beige,125.48,Viscose,2018,35-45,Winter,Shirt,79,3.6
...,...,...,...,...,...,...,...,...,...,...
655,Female,Green,48.45,Wool,2022,25-35,Summer,Jacket,328,5.0
656,Female,White,61.52,Cotton,2022,18-24,Winter,Dress,200,4.7
657,Female,White,31.07,Synthetic,2022,35-45,Autumn,Shoes,478,4.0
658,Male,Red,139.69,Cotton Blend,2022,18-24,Winter,Shirt,125,4.8


# Summary Statistics of Numeric Predictors

In [16]:
# Count, mean, spread and quartiles of the numeric columns, to two decimals.
(
    fashiondata_extracted
    .loc[:, ['price', 'year_of_sale', 'sales_count', 'average_rating']]
    .describe()
    .round(decimals=2)
)

Unnamed: 0,price,year_of_sale,sales_count,average_rating
count,660.0,660.0,660.0,660.0
mean,84.97,2020.0,265.55,4.02
std,37.62,1.42,132.33,0.55
min,20.25,2018.0,50.0,3.0
25%,53.2,2019.0,149.0,3.6
50%,82.04,2020.0,257.0,4.0
75%,117.77,2021.0,378.5,4.5
max,149.93,2022.0,500.0,5.0


# Overview of Categorical Predictors

In [17]:
# Count, number of distinct values, most frequent value and its frequency
# for each text column.
(
    fashiondata_extracted
    .loc[:, ['gender', 'age_group', 'color', 'material', 'season', 'category']]
    .describe()
)

Unnamed: 0,gender,age_group,color,material,season,category
count,660,660,660,660,660,660
unique,2,3,12,7,5,8
top,Female,18-24,White,Leather,All,Skirt
freq,331,226,174,104,149,99


# Grouping Data into a Pivot Table

In [18]:
# Sum the numeric columns for every (year, season, category) combination.
# numeric_only=True keeps the text columns (gender, color, material,
# age_group) out of the aggregation: how they are handled otherwise depends
# on the pandas version (2.x "sums" them by concatenating the strings).
df = (
    fashiondata_extracted
    .groupby(['year_of_sale', 'season', 'category'])
    .sum(numeric_only=True)
)

# Spread the categories across the columns, one row per (year, season).
# aggfunc='sum' is spelled out because pivot_table defaults to the mean,
# which would only match the totals while every cell holds a single row.
# Combinations without any sales come out as NaN, so they are filled with 0
# before the counts are stored as integers.
pivoted = (
    pd.pivot_table(
        df,
        values='sales_count',
        index=['year_of_sale', 'season'],
        columns=['category'],
        aggfunc='sum',
    )
    .fillna(0)
    .astype(int)
)

display(pivoted)

Unnamed: 0_level_0,category,Blouse,Dress,Jacket,Jeans,Shirt,Shoes,Shorts,Skirt
year_of_sale,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018,All,1337,445,670,1094,691,1393,548,1199
2018,Autumn,0,558,715,529,1573,1033,234,699
2018,Spring,2168,466,345,0,743,452,1121,1841
2018,Summer,0,705,1544,1328,0,994,1521,977
2018,Winter,979,940,1233,1695,830,668,1424,683
2019,All,0,1551,892,100,177,1908,959,1723
2019,Autumn,666,152,625,1408,717,1110,1786,1671
2019,Spring,229,1187,656,1178,610,1390,921,205
2019,Summer,0,893,65,0,857,320,1325,666
2019,Winter,2192,1183,673,1109,122,1653,550,1366


# Summary Statistics for Sales Count

In [19]:
# Distribution of the per-(year, season) sales totals for each category,
# rounded to two decimals.
pivoted.describe().round(decimals=2)

category,Blouse,Dress,Jacket,Jeans,Shirt,Shoes,Shorts,Skirt
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,801.28,841.84,769.16,880.88,826.48,931.68,900.44,1058.84
std,704.43,408.67,553.58,653.93,481.09,490.5,444.48,550.83
min,0.0,146.0,0.0,0.0,0.0,0.0,174.0,0.0
25%,229.0,466.0,549.0,318.0,610.0,532.0,583.0,683.0
50%,709.0,940.0,656.0,913.0,726.0,961.0,850.0,1140.0
75%,1240.0,1175.0,892.0,1300.0,1302.0,1133.0,1325.0,1511.0
max,2192.0,1551.0,2580.0,2659.0,1717.0,1908.0,1786.0,1841.0


# Pivot Table for Total Sales

In [20]:
# Add up the sales of all categories within each (year, season) row
summed_sales = pivoted.sum(axis='columns')

# Turn the totals into a single-column DataFrame named 'total_sales'
finaldataset = summed_sales.to_frame(name='total_sales')

# Show the resulting table
display(finaldataset)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_sales
year_of_sale,season,Unnamed: 2_level_1
2018,All,7377
2018,Autumn,5341
2018,Spring,7136
2018,Summer,7069
2018,Winter,8452
2019,All,7310
2019,Autumn,8135
2019,Spring,6376
2019,Summer,4126
2019,Winter,8848


# Summary Statistics for Total Sales Count

In [21]:
# Summary statistics of total sales across all (year, season) rows,
# rounded to two decimals.
finaldataset.describe().round(decimals=2)

Unnamed: 0,total_sales
count,25.0
mean,7010.6
std,1431.35
min,4126.0
25%,5892.0
50%,7136.0
75%,8081.0
max,9997.0


# Summary Statistics for Total Sales Count for Each Year

In [23]:
# Summarise total sales separately for each year.
# The per-year frames are derived directly from finaldataset (grouping on the
# 'year_of_sale' index level) so this cell does not depend on a variable
# created outside the visible notebook, and the set of years follows the data
# instead of being hard-coded.
for year, year_sales in finaldataset.groupby(level='year_of_sale'):
    print(f"\nYear {year}:")
    display(year_sales.describe().round(2))


Year 2018:


Unnamed: 0,total_sales
count,5.0
mean,7075.0
std,1117.79
min,5341.0
25%,7069.0
50%,7136.0
75%,7377.0
max,8452.0



Year 2019:


Unnamed: 0,total_sales
count,5.0
mean,6959.0
std,1833.05
min,4126.0
25%,6376.0
50%,7310.0
75%,8135.0
max,8848.0



Year 2020:


Unnamed: 0,total_sales
count,5.0
mean,7130.0
std,1110.55
min,5892.0
25%,6503.0
50%,6669.0
75%,8081.0
max,8505.0



Year 2021:


Unnamed: 0,total_sales
count,5.0
mean,6913.0
std,1901.53
min,5022.0
25%,5840.0
50%,6523.0
75%,7183.0
max,9997.0



Year 2022:


Unnamed: 0,total_sales
count,5.0
mean,6976.0
std,1671.56
min,4707.0
25%,5850.0
50%,7529.0
75%,7941.0
max,8853.0
