# SUPERSTORE ML APPLICATION
After exploring the Superstore dataset, I want to apply machine learning to it, both to make predictions and to deepen my understanding of the data. I plan to build two models: a classifier that predicts whether a deal will be profitable, and a second model that segments customers based on their total sales, total profit, and various other features.

In [None]:
import pandas as pd
import numpy as np
import sklearn

In [None]:
# Read the Superstore CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='superstore2.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Category,City,Country,Customer.ID,Customer.Name,Discount,Market,Order.Date,Order.ID,...,Shipping.Cost,State,Sub.Category,Year,Market2,weeknum,Month,Day,Date,Shipping.Duration
0,0,Office Supplies,Los Angeles,United States,LS-172304,Lycoris Saunders,0.0,US,2011-01-07,CA-2011-130813,...,4.37,California,Paper,2011,North America,2,1,7,2011-01-07,2 days
1,1,Office Supplies,Los Angeles,United States,MV-174854,Mark Van Huff,0.0,US,2011-01-21,CA-2011-148614,...,0.94,California,Paper,2011,North America,4,1,21,2011-01-21,5 days
2,2,Office Supplies,Los Angeles,United States,CS-121304,Chad Sievert,0.0,US,2011-08-05,CA-2011-118962,...,1.81,California,Paper,2011,North America,32,8,5,2011-08-05,4 days
3,3,Office Supplies,Los Angeles,United States,CS-121304,Chad Sievert,0.0,US,2011-08-05,CA-2011-118962,...,4.59,California,Paper,2011,North America,32,8,5,2011-08-05,4 days
4,4,Office Supplies,Los Angeles,United States,AP-109154,Arthur Prichep,0.0,US,2011-09-29,CA-2011-146969,...,1.32,California,Paper,2011,North America,40,9,29,2011-09-29,4 days


## Profitability Prediction

In [None]:
# Display the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Category', 'City', 'Country', 'Customer.ID',
       'Customer.Name', 'Discount', 'Market', 'Order.Date', 'Order.ID',
       'Order.Priority', 'Product.ID', 'Product.Name', 'Profit', 'Quantity',
       'Region', 'Row.ID', 'Sales', 'Segment', 'Ship.Date', 'Ship.Mode',
       'Shipping.Cost', 'State', 'Sub.Category', 'Year', 'Market2', 'weeknum',
       'Month', 'Day', 'Date', 'Shipping.Duration'],
      dtype='object')

In [None]:
# Binary target: 1 when the row's Profit is strictly positive, 0 otherwise.
df['Profitable'] = df['Profit'].gt(0).astype(int)

In [None]:
# Remove columns that will not be used as features. 'Profit' is dropped here
# because the 'Profitable' target was derived directly from it.
drop_cols = [
    'Row.ID', 'Order.ID', 'Customer.ID', 'Product.ID',
    'Order.Date', 'Ship.Date', 'Profit',
]
df = df.drop(drop_cols, axis=1)

# Second pass: name columns, the 'Date' column and the leftover 'Unnamed: 0' column.
df = df.drop(['Customer.Name', 'Product.Name', 'Date', 'Unnamed: 0'], axis=1)

In [None]:
# Convert "4 days" → 4
# The pattern is a raw string so that \d reaches the regex engine as written;
# in a plain string literal \d is an invalid escape sequence and newer Python
# versions emit a warning for it. expand=False makes extract return a Series
# for the single capture group. Rows with no digits yield NaN, which is why
# the result is cast to float rather than int.
df['Shipping.Duration'] = (
    df['Shipping.Duration']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)


In [None]:
# One-hot encode the listed categorical columns. drop_first=True omits the
# first level of each column so the dummies are not perfectly collinear.
categorical_cols = [
    'Category',
    'Sub.Category',
    'Region',
    'Segment',
    'Market',
    'Ship.Mode',
    'Order.Priority',
]
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)

In [None]:
# Drop the remaining text columns that were not one-hot encoded.
df = df.drop(['City', 'Country', 'State', 'Market2'], axis=1)


In [None]:
# Display the column labels of the frame as it stands after the drops and encoding.
df.columns

Index(['Discount', 'Quantity', 'Sales', 'Shipping.Cost', 'Year', 'weeknum',
       'Month', 'Day', 'Shipping.Duration', 'Profitable',
       'Category_Office Supplies', 'Category_Technology',
       'Sub.Category_Appliances', 'Sub.Category_Art', 'Sub.Category_Binders',
       'Sub.Category_Bookcases', 'Sub.Category_Chairs', 'Sub.Category_Copiers',
       'Sub.Category_Envelopes', 'Sub.Category_Fasteners',
       'Sub.Category_Furnishings', 'Sub.Category_Labels',
       'Sub.Category_Machines', 'Sub.Category_Paper', 'Sub.Category_Phones',
       'Sub.Category_Storage', 'Sub.Category_Supplies', 'Sub.Category_Tables',
       'Region_Canada', 'Region_Caribbean', 'Region_Central',
       'Region_Central Asia', 'Region_EMEA', 'Region_East', 'Region_North',
       'Region_North Asia', 'Region_Oceania', 'Region_South',
       'Region_Southeast Asia', 'Region_West', 'Segment_Corporate',
       'Segment_Home Office', 'Market_Africa', 'Market_Canada', 'Market_EMEA',
       'Market_EU', 'Market_LA

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the feature matrix.
y = df['Profitable']
X = df.drop(columns=['Profitable'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic regression on the training split; fit() returns the estimator
# itself, so the call can be chained. max_iter is raised far above the default
# to give the solver room to converge.
model = LogisticRegression(max_iter=100_000).fit(X_train, y_train)

# Predicted class labels for the held-out rows.
y_pred = model.predict(X_test)


In [None]:
# Report accuracy, the per-class precision/recall table and the confusion matrix
# for the held-out predictions.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))


Accuracy: 0.9183076623123416

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.76      0.83      2655
           1       0.92      0.97      0.95      7603

    accuracy                           0.92     10258
   macro avg       0.91      0.87      0.89     10258
weighted avg       0.92      0.92      0.92     10258


Confusion Matrix:
 [[2024  631]
 [ 207 7396]]
