In [90]:
import pandas as pd

# Retail Customer Prediction

We are providing a model to predict the product type a customer is buying based off other available attributes such as gender, income, country, and such. Predicting which product a customer is likely to buy based off these inputs will then allow for more personalized ads and more effective marketing by identifying target audiences. Then in turn these predictions can also be used in grocery stores as they anticpate how much to stock certain items based off certain trends related to local demographics.

In [91]:
data = pd.read_csv('retail_data.csv')

In [92]:
print (data.head())
print (data.columns)
print (data['Date'])

   Transaction_ID  Customer_ID                 Name                Email  \
0       8691788.0      37249.0  Michelle Harrington    Ebony39@gmail.com   
1       2174773.0      69749.0          Kelsey Hill     Mark36@gmail.com   
2       6679610.0      30192.0         Scott Jensen    Shane85@gmail.com   
3       7232460.0      62101.0        Joseph Miller     Mary34@gmail.com   
4       4983775.0      27901.0        Debra Coleman  Charles30@gmail.com   

          Phone                      Address        City            State  \
0  1.414787e+09            3959 Amanda Burgs    Dortmund           Berlin   
1  6.852900e+09           82072 Dawn Centers  Nottingham          England   
2  8.362160e+09            4133 Young Canyon     Geelong  New South Wales   
3  2.776752e+09  8148 Thomas Creek Suite 100    Edmonton          Ontario   
4  9.098268e+09    5813 Lori Ports Suite 269     Bristol          England   

   Zipcode    Country  ...  Total_Amount Product_Category  Product_Brand  \
0  7

In [93]:
# MARK DOWN FOR DROPPING ATTRIBUTES, JUSTIFY WHY

In [94]:
data = data.drop(columns=['City', 'State', 'Zipcode', 'Country', 'Transaction_ID', 'Customer_ID', 'Name', 'Email', 'Phone', 'Address', 'Order_Status', 'Payment_Method', 'products', 'Year', 'Month', 'Time'])
data.columns

Index(['Age', 'Gender', 'Income', 'Customer_Segment', 'Date',
       'Total_Purchases', 'Amount', 'Total_Amount', 'Product_Category',
       'Product_Brand', 'Product_Type', 'Feedback', 'Shipping_Method',
       'Ratings'],
      dtype='object')

In [95]:
# MARK DOWN FOR KNOWING UNIQUE VALUES PER ATTRIBUTE

In [96]:



# Get the number of unique values per column (attribute)
unique_counts = data.nunique()

# Print the result
print("Number of unique values per attribute:")
print(unique_counts)

Number of unique values per attribute:
Age                     53
Gender                   2
Income                   3
Customer_Segment         3
Date                   366
Total_Purchases         10
Amount              299297
Total_Amount        299305
Product_Category         5
Product_Brand           18
Product_Type            33
Feedback                 4
Shipping_Method          3
Ratings                  5
dtype: int64


In [97]:
# MARK DOWN FOR LOOKING AT UNIQUE VALUES PER ATTRIBUTE

In [98]:

for column in data.columns:
    unique_values = data[column].unique()
    print(f"Unique values in '{column}':")
    print(unique_values)
    print("-" * 50)  # Print a separator line for better readability

Unique values in 'Age':
[21. 19. 48. 56. 22. 58. 29. 46. 25. 64. 31. 53. 32. 43. 69. 49. 61. 41.
 38. 59. 20. 67. 50. 26. 66. 24. 54. 28. 34. 65. 40. 68. 36. 57. 27. 35.
 70. 37. 30. 39. 47. 18. 60. 33. 62. 42. 44. 51. 63. 55. 23. 52. nan 45.]
--------------------------------------------------
Unique values in 'Gender':
['Male' 'Female' nan]
--------------------------------------------------
Unique values in 'Income':
['Low' 'High' 'Medium' nan]
--------------------------------------------------
Unique values in 'Customer_Segment':
['Regular' 'Premium' 'New' nan]
--------------------------------------------------
Unique values in 'Date':
['9/18/2023' '12/31/2023' '4/26/2023' '05-08-23' '01-10-24' '9/21/2023'
 '6/26/2023' '3/24/2023' '01-06-24' '10-04-23' '7/20/2023' '6/21/2023'
 '01-02-24' '05-07-23' '11/18/2023' '6/15/2023' '07-01-23' '4/14/2023'
 '02-07-24' '10/24/2023' '3/21/2023' '08-06-23' '02-03-24' '5/31/2023'
 '2/27/2024' '4/20/2023' '06-03-23' '01-07-24' '4/22/2023' '02-06-24'

In [99]:
# MARK DOWN FOR LOOKING AT DISTRIBUTION OF DATA, MAY NEED TO COMBINE WITH CELL ABOVE

In [100]:
for column in data.columns:
    # Get the counts of unique values and normalize them to get percentages
    value_counts = data[column].value_counts(normalize=True) * 100
    
    print(f"Percentage of unique values in '{column}':")
    print(value_counts)
    print("-" * 50)  # Print a separator line for better readability

Percentage of unique values in 'Age':
Age
20.0    11.488982
46.0    10.190600
26.0     8.173617
22.0     7.627627
34.0     6.842104
23.0     5.974417
19.0     5.581158
21.0     2.668328
24.0     2.050444
55.0     1.812568
48.0     1.526652
70.0     0.905787
43.0     0.892866
59.0     0.890216
36.0     0.888559
68.0     0.884252
33.0     0.883589
64.0     0.876963
44.0     0.875638
56.0     0.872988
30.0     0.870337
65.0     0.870006
60.0     0.868018
62.0     0.867024
47.0     0.864705
40.0     0.863380
41.0     0.859736
25.0     0.859404
50.0     0.859073
35.0     0.857416
52.0     0.857085
28.0     0.856754
58.0     0.855760
39.0     0.855760
69.0     0.855097
38.0     0.854103
42.0     0.854103
57.0     0.853772
61.0     0.853109
63.0     0.852447
37.0     0.852116
27.0     0.851784
49.0     0.851784
67.0     0.845158
51.0     0.844827
45.0     0.844164
53.0     0.843502
29.0     0.841514
32.0     0.838201
54.0     0.833894
18.0     0.821636
66.0     0.819648
31.0     0.817329
Name

In [101]:
# MARK DOWN FOR IDENTIFYING NANS

In [102]:
# Filter rows that contain at least one NaN value
rows_with_nans = data[data.isna().any(axis=1)]

# Print the rows with NaN values
print("Rows with NaN values:")
print(rows_with_nans)
print ("NUM ROWS", len(rows_with_nans))

print ("NUM RECORDS: ", len(data))


Rows with NaN values:
         Age  Gender  Income Customer_Segment       Date  Total_Purchases  \
109     65.0    Male     Low          Regular  6/24/2023              4.0   
123     39.0    Male  Medium          Regular   03-05-23             10.0   
142     37.0    Male    High          Regular   09-12-23              1.0   
174     50.0    Male    High          Regular  7/23/2023              NaN   
232      NaN  Female     Low          Regular   01-09-24             10.0   
...      ...     ...     ...              ...        ...              ...   
301515  36.0  Female  Medium          Regular  9/23/2023              6.0   
301567  37.0    Male     Low              New  2/27/2024              1.0   
301738  35.0    Male     NaN          Premium   11-07-23              2.0   
301875  44.0    Male    High          Premium   01-09-24              6.0   
301883  31.0    Male     NaN          Regular   06-09-23             10.0   

            Amount  Total_Amount Product_Category  Pr

In [103]:
""""
Number of unique values per attribute:
City                   130
State                   54
Zipcode              93978
Country                  5
Age                     53
Gender                   2
Income                   3
Customer_Segment         3
Date                   366
Year                     2
Month                   12
Time                 83677
Total_Purchases         10
Amount              299297
Total_Amount        299305
Product_Category         5
Product_Brand           18
Product_Type            33
Feedback                 4
Shipping_Method          3
Payment_Method           4
Order_Status             4
Ratings                  5
products               318
dtype: int64

Unique values in 'City':
['Dortmund' 'Nottingham' 'Geelong' 'Edmonton' 'Bristol' 'Brisbane'
 'Kitchener' 'Munich' 'Wollongong' 'Cologne' 'Portsmouth' 'San Jose'
 'Hamilton' 'Manchester' 'Cardiff' 'Glasgow' 'Hull' 'Cleveland'
 'Southampton' 'Leipzig' 'Cairns' 'London' 'Bielefeld' 'Düsseldorf'
 'Philadelphia' 'Halifax' 'Montreal' 'Mackay' 'Quebec City' 'Barrie'
 'Adelaide' 'Leeds' 'Plymouth' 'Perth' 'Sheffield' 'Frankfurt' 'Toronto'
 'Essen' 'Kelowna' 'Birmingham' 'Ottawa' 'Liverpool' "St. John's" 'Hobart'
 'Atlanta' 'New Orleans' 'Wichita' 'Albury-Wodonga' 'Winnipeg' 'Vancouver'
 'Hamburg' 'Windsor' 'Calgary' 'Newcastle upon Tyne' 'Townsville' 'Oshawa'
 'Houston' 'Berlin' 'Seattle' 'Charlotte' 'New York' 'Milwaukee'
 'Edinburgh' 'Launceston' 'Bochum' 'Münster' 'Bonn' 'Columbus' 'Melbourne'
 'Leicester' 'Fort Worth' 'Toowoomba' 'Victoria' 'Oxford' 'Canberra'
 'Saskatoon' 'Memphis' 'Regina' 'Hanover' 'Long Beach' 'San Francisco'
 'Nuremberg' 'Minneapolis' 'Colorado Springs' 'Duisburg' 'Denver'
 'Bendigo' 'Bremen' 'Boston' 'Tucson' 'Tulsa' 'Portland' 'Kansas City'
 'Brighton' 'Sydney' 'Darwin' 'Nashville' 'Belfast' 'Baltimore'
 'Virginia Beach' 'Arlington' 'Louisville' 'Mesa' 'Las Vegas' 'Dallas'
 'Washington' 'Gold Coast' 'Miami' 'Newcastle' 'San Antonio' 'Fresno'
 'Phoenix' 'Austin' 'Stuttgart' 'Chicago' 'Wuppertal' 'Jacksonville'
 'Ballarat' 'Omaha' 'Oklahoma City' 'San Diego' 'Sacramento' 'Dresden'
 'Raleigh' 'Oakland' 'Indianapolis' 'Detroit' 'Los Angeles' 'El Paso' nan
 'Albuquerque']
--------------------------------------------------
Unique values in 'State':
...
 'Central AC' 'Ductless AC' 'Portable AC' 'Air conditioner'
 'Floor-standing AC' 'Split AC' 'Window AC' 'Inverter AC' 'Cassette AC'
 'Mini-split AC' 'Package AC']
--------------------------------------------------
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
Percentage of unique values in 'City':
City
Chicago          7.175191
Portsmouth       6.671151
San Francisco    4.038945
Frankfurt        3.374182
Boston           3.120340
                   ...   
Charlotte        0.285656
Arlington        0.283667
Denver           0.283667
Philadelphia     0.278365
San Diego        0.276045
Name: proportion, Length: 130, dtype: float64
--------------------------------------------------
Percentage of unique values in 'State':
State
England            20.899218
Berlin             17.506438
New South Wales    15.019107
Ontario            15.017449
Connecticut         7.177964
Maine               4.041043
Georgia             3.149846
Kansas              1.828131
New Mexico          1.723069
New York            0.330098
Maryland            0.324795
Pennsylvania        0.320155
Texas               0.315846
California          0.315515
Nevada              0.314521
South Carolina      0.313195
Minnesota           0.313195
West Virginia       0.312864
Iowa                0.312864
Ohio                0.311869
Illinois            0.311869
Mississippi         0.310875
Hawaii              0.310212
Louisiana           0.307561
Utah                0.306235
Michigan            0.305904
New Jersey          0.304247
Virginia            0.304247
Alabama             0.301264
Montana             0.300269
Oregon              0.300269
Wyoming             0.300269
South Dakota        0.299607
Delaware            0.299607
Tennessee           0.299275
North Carolina      0.298944
Wisconsin           0.298612
Missouri            0.298612
Florida             0.298612
Nebraska            0.298281
Rhode Island        0.298281
Kentucky            0.297949
Indiana             0.297287
Arkansas            0.296955
New Hampshire       0.296955
Alaska              0.293972
Vermont             0.293641
Colorado            0.292647
North Dakota        0.291984
Washington          0.291321
Idaho               0.290658
Massachusetts       0.288670
Arizona             0.286018
Oklahoma            0.281710
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Zipcode':
Zipcode
68029.0    0.007624
2826.0     0.006630
2891.0     0.006630
68005.0    0.006630
68070.0    0.006630
             ...   
99417.0    0.000331
29716.0    0.000331
26064.0    0.000331
41703.0    0.000331
7510.0     0.000331
Name: proportion, Length: 93978, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Country':
Country
USA          31.558068
UK           20.900845
Germany      17.508509
Australia    15.019272
Canada       15.013306
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Age':
Age
20.0    11.488982
46.0    10.190600
26.0     8.173617
22.0     7.627627
34.0     6.842104
23.0     5.974417
19.0     5.581158
21.0     2.668328
24.0     2.050444
55.0     1.812568
48.0     1.526652
70.0     0.905787
43.0     0.892866
59.0     0.890216
36.0     0.888559
68.0     0.884252
33.0     0.883589
64.0     0.876963
44.0     0.875638
56.0     0.872988
30.0     0.870337
65.0     0.870006
60.0     0.868018
62.0     0.867024
47.0     0.864705
40.0     0.863380
41.0     0.859736
25.0     0.859404
50.0     0.859073
35.0     0.857416
52.0     0.857085
28.0     0.856754
58.0     0.855760
39.0     0.855760
69.0     0.855097
38.0     0.854103
42.0     0.854103
57.0     0.853772
61.0     0.853109
63.0     0.852447
37.0     0.852116
49.0     0.851784
27.0     0.851784
67.0     0.845158
51.0     0.844827
45.0     0.844164
53.0     0.843502
29.0     0.841514
32.0     0.838201
54.0     0.833894
18.0     0.821636
66.0     0.819648
31.0     0.817329
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Gender':
Gender
Male      62.182086
Female    37.817914
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Income':
Income
Medium    43.162535
Low       31.904083
High      24.933382
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Customer_Segment':
Customer_Segment
Regular    48.450438
New        30.214881
Premium    21.334681
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Date':
Date
11/22/2023    0.301673
2/22/2024     0.299684
"""


'"\nNumber of unique values per attribute:\nCity                   130\nState                   54\nZipcode              93978\nCountry                  5\nAge                     53\nGender                   2\nIncome                   3\nCustomer_Segment         3\nDate                   366\nYear                     2\nMonth                   12\nTime                 83677\nTotal_Purchases         10\nAmount              299297\nTotal_Amount        299305\nProduct_Category         5\nProduct_Brand           18\nProduct_Type            33\nFeedback                 4\nShipping_Method          3\nPayment_Method           4\nOrder_Status             4\nRatings                  5\nproducts               318\ndtype: int64\n\nUnique values in \'City\':\n[\'Dortmund\' \'Nottingham\' \'Geelong\' \'Edmonton\' \'Bristol\' \'Brisbane\'\n \'Kitchener\' \'Munich\' \'Wollongong\' \'Cologne\' \'Portsmouth\' \'San Jose\'\n \'Hamilton\' \'Manchester\' \'Cardiff\' \'Glasgow\' \'Hull\' \'Cleveland\'\n 

In [104]:
"""07-01-23      0.297032
7/30/2023     0.297032
12/28/2023    0.296369
                ...   
12/27/2023    0.250289
5/24/2023     0.249958
12/21/2023    0.249626
11-02-23      0.248632
12/17/2023    0.240012
Name: proportion, Length: 366, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Year':
Year
2023.0    83.488696
2024.0    16.511304
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Month':
Month
April        13.687748
January      12.356456
August       10.940654
July         10.236067
May           9.389303
March         6.343935
October       6.338633
December      6.278647
September     6.182536
November      6.103328
June          6.091397
February      6.051296
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Time':
Time
2:16:01     0.004641
15:54:27    0.004641
2:55:36     0.004641
7:45:07     0.004641
2:18:09     0.004309
              ...   
6:47:54     0.000331
13:54:25    0.000331
12:49:44    0.000331
18:18:14    0.000331
17:12:59    0.000331
Name: proportion, Length: 83677, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Total_Purchases':
Total_Purchases
5.0     10.582167
2.0     10.577857
1.0     10.573216
3.0     10.562608
4.0     10.467464
8.0      9.515032
6.0      9.459670
9.0      9.433149
7.0      9.429834
10.0     9.399003
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Amount':
Amount
296.542326    0.000663
47.835704     0.000663
298.520071    0.000663
112.731674    0.000663
454.852896    0.000663
                ...   
446.035498    0.000332
312.675052    0.000332
193.549487    0.000332
415.416260    0.000332
403.353907    0.000332
Name: proportion, Length: 299297, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Total_Amount':
Total_Amount
1177.654223    0.000663
2198.252948    0.000663
1561.495184    0.000663
2205.159123    0.000663
1366.645119    0.000663
                 ...   
2676.212987    0.000331
1563.375261    0.000331
193.549487     0.000331
4154.162595    0.000331
2630.714413    0.000331
Name: proportion, Length: 299305, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Product_Category':
Product_Category
Electronics    23.596165
Grocery        22.134579
Clothing       18.142559
Books          18.103120
Home Decor     18.023578
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Product_Brand':
Product_Brand
Pepsi                10.039473
Coca-Cola             6.095536
Samsung               6.090233
Zara                  6.088245
HarperCollins         6.082611
Sony                  6.071342
Bed Bath & Beyond     6.046485
Adidas                6.044497
Home Depot            6.013675
Nike                  6.011686
Penguin Books         6.011355
Random House          6.006052
Nestle                6.001743
Apple                 5.989812
IKEA                  5.963630
Whirepool             2.467446
Mitsubhisi            2.228490
BlueStar              0.747691
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Product_Type':
Product_Type
Water                                 8.098738
Smartphone                            6.115029
Non-Fiction                           6.017019
Fiction                               5.974968
Juice                                 4.049535
Television                            4.038608
T-shirt                               4.033310
Decorations                           4.032317
Shoes                                 4.019734
Tablet                                4.003510
Soft Drink                            3.985299
Furniture                             3.963114
Fridge                                2.465150
Mitsubishi 1.5 Ton 3 Star Split AC    2.226416
Thriller                              2.073441
Kitchen                               2.062183
Coffee                                2.045628
Children's                            2.035363
Jeans                                 2.031059
Shirt                                 2.030396
Dress                                 2.027416
Shorts                                2.015496
Headphones                            2.013178
Lighting                              2.011523
Chocolate                             2.003576
Literature                            2.003576
Bathroom                              1.995298
Bedding                               1.990331
Jacket                                1.988014
Laptop                                1.980398
Tools                                 1.969471
Snacks                                1.953909
BlueStar AC                           0.746995
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Feedback':
Feedback
Excellent    33.381816
Good         31.507557
Average      20.764944
Bad          14.345683
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Shipping_Method':
Shipping_Method
Same-Day    34.526126
Express     33.929453
Standard    31.544421
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Payment_Method':
Payment_Method
Credit Card    29.868120
Debit Card     25.451340
Cash           24.463646
PayPal         20.216895
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Order_Status':
Order_Status
Delivered     43.227239
Shipped       21.547179
Processing    18.954188
Pending       16.271394
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'Ratings':
Ratings
4.0    32.490243
2.0    20.764944
5.0    16.616527
3.0    15.782603
1.0    14.345683
Name: proportion, dtype: float64
--------------------------------------------------
Percentage of unique values in 'products':
products
Spring water       0.830436
Bottled water      0.826463
Mystery            0.826463
Artesian water     0.820503
Distilled water    0.819509
                     ...   
Screwdriver set    0.185093
Towel rack         0.184762
Razer Blade        0.184100
Parka              0.181782
Package AC         0.073839
Name: proportion, Length: 318, dtype: float64
--------------------------------------------------"""

"07-01-23      0.297032\n7/30/2023     0.297032\n12/28/2023    0.296369\n                ...   \n12/27/2023    0.250289\n5/24/2023     0.249958\n12/21/2023    0.249626\n11-02-23      0.248632\n12/17/2023    0.240012\nName: proportion, Length: 366, dtype: float64\n--------------------------------------------------\nPercentage of unique values in 'Year':\nYear\n2023.0    83.488696\n2024.0    16.511304\nName: proportion, dtype: float64\n--------------------------------------------------\nPercentage of unique values in 'Month':\nMonth\nApril        13.687748\nJanuary      12.356456\nAugust       10.940654\nJuly         10.236067\nMay           9.389303\nMarch         6.343935\nOctober       6.338633\nDecember      6.278647\nSeptember     6.182536\nNovember      6.103328\nJune          6.091397\nFebruary      6.051296\nName: proportion, dtype: float64\n--------------------------------------------------\nPercentage of unique values in 'Time':\nTime\n2:16:01     0.004641\n15:54:27    0.004641

In [105]:
# MARK DOWN FOR DROPPING NANS, EXPLAIN WHY RATHER THAN FORWARD FILLING

In [106]:
# Drop all rows with NaN values
data = data.dropna()
# Show the first few rows of the cleaned data
data.head()
print ("NUm remaining", len(data))

NUm remaining 298513


In [107]:
# MARK DOWN FOR FEATURE ENGINEERING FEATURE WE WANT TO PREDICT

In [None]:
# Add new features supposed from discord

item_to_category = {
    'Water': 'Drink',
    'Juice': 'Drink',
    'Soft Drink': 'Drink',
    'Coffee': 'Drink',
    'Chocolate': 'Food',
    'Snacks': 'Food',
    'Non-Fiction': 'Media',
    'Fiction': 'Media',
    'Thriller': 'Media',
    'Literature': 'Media',
    'Television': 'Media',
    'Mitsubishi AC': 'HVAC',
    'BlueStar AC': 'HVAC',
    'Fridge': 'Kitchen Equipment',
    'Kitchen': 'Kitchen Equipment',
    'T-Shirt': 'Upper Body Clothing',
    'Shirt': 'Upper Body Clothing',
    'Dress': 'Upper Body Clothing',
    'Jacket': 'Upper Body Clothing',
    'Shoes': 'Lower Body Clothing',
    'Jeans': 'Lower Body Clothing',
    'Shorts': 'Lower Body Clothing',
    'Decorations': 'Home Decor',
    'Furniture': 'Home Decor',
    'Lighting': 'Home Decor',
    'Bathroom': 'Home Decor',
    'Bedding': 'Home Decor',
    'Smartphone': 'Electronics',
    'Tablet': 'Electronics',
    'Headphones': 'Electronics',
    'Laptop': 'Electronics',
    'Children\'s': 'Children\'s',
    'Tools': 'Tools'
}

# Product_subcategory

# Update records with new sub_category column
data['Product_Subcategory'] = data['Product_Type'].map(item_to_category)
last_8_columns = data.iloc[:, -8:]  # Select the last 8 columns
print(last_8_columns.head(5))  # Display the first 5 rows

data = data.drop(columns=['Product_Type'])

# Convert 'Date' column to day_of_year
data['Date'] = pd.to_datetime(data['Date'], errors='coerce', format=None)

data['day_of_year'] = data['Date'].dt.dayofyear

KeyError: 'Product_Type'

In [None]:
# MARK DOWN FOR ONE HOT ENCODING

In [None]:
# One hot encoding for Categorical Data
data_encoded = pd.get_dummies(data, columns=['Gender', 'Feedback', 'Product_Subcategory', 'Income', 'Product_Category', 'Product_Brand', 'Customer_Segment', 'Shipping_Method'], drop_first=True)
data_encoded.head()

KeyError: "['Product_Subcategory'] not in index"

In [None]:
# MARK DOWN FOR NORMALIZING

In [None]:
# NORMALIZE
from sklearn.preprocessing import StandardScaler
data_normalized = data_encoded.copy()
scaler = StandardScaler()
data_normalized[data_normalized.columns] = scaler.fit_transform(data_normalized[data_normalized.columns])
data_normalized.head()

Unnamed: 0,Age,Total_Purchases,Amount,Total_Amount,Ratings,Gender_Male,Feedback_Bad,Feedback_Excellent,Feedback_Good,Product_Subcategory_Drink,...,Product_Brand_Pepsi,Product_Brand_Random House,Product_Brand_Samsung,Product_Brand_Sony,Product_Brand_Whirepool,Product_Brand_Zara,Customer_Segment_Premium,Customer_Segment_Regular,Shipping_Method_Same-Day,Shipping_Method_Standard
0,-0.963557,-0.822643,-1.040313,-0.924335,1.391214,0.779957,-0.409342,1.4131,-0.67843,-0.471463,...,-0.334225,-0.252906,-0.254408,-0.254311,-0.159349,-0.254617,-0.520476,1.031396,1.377106,-0.679072
1,-1.096704,-1.171239,1.048523,-0.496771,0.634115,-1.282122,-0.409342,1.4131,-0.67843,-0.471463,...,-0.334225,-0.252906,3.930694,-0.254311,-0.159349,-0.254617,1.921319,-0.96956,-0.726161,1.472599
2,0.833929,-0.822643,0.702821,-0.269333,-0.880084,0.779957,-0.409342,-0.707664,-0.67843,-0.471463,...,-0.334225,-0.252906,-0.254408,-0.254311,-0.159349,-0.254617,-0.520476,1.031396,1.377106,-0.679072
3,1.366517,0.571744,0.688181,0.973987,0.634115,0.779957,-0.409342,1.4131,-0.67843,-0.471463,...,-0.334225,-0.252906,-0.254408,-0.254311,-0.159349,-0.254617,1.921319,-0.96956,-0.726161,1.472599
4,-0.896983,-1.171239,-0.925392,-0.991252,-1.637183,0.779957,2.442942,-0.707664,-0.67843,-0.471463,...,-0.334225,-0.252906,-0.254408,-0.254311,-0.159349,-0.254617,1.921319,-0.96956,-0.726161,1.472599


In [None]:
# Model, LETS DO EVERY METHOD : SVM, Decision Tree, KNN, Neural Nets, Naive Bayes, Basically Ensemble Methods

# Make a decision tree model
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Split the data into training and test sets
training_data, test_data = train_test_split(data_normalized, test_size=0.2, random_state=42)

: 