# Assignment 1

_This notebook contains a structured data science workflow._

## 📌 Objective
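Predict whether a Superstore order line is profitable (`Profit > 0`) with a random forest classifier, and compare two preprocessing approaches: (1) basic cleaning with date features and one-hot encoding, and (2) label encoding with the same date features.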

## 📁 Import Libraries
# Step 1: Import Required Libraries

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import os

# Walk the Kaggle input directory and print the full path of every file found
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Read the Superstore CSV; the file is decoded as latin1
df = pd.read_csv(
    '/kaggle/input/superstore1/Sample - Superstore.csv',
    encoding='latin1',
)

# Show the first rows as the cell's rendered output
df.head()

/kaggle/input/superstore1/Sample - Superstore.csv


Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


## 📊 Load Dataset
# Step 2: Load and Preview Dataset

In [3]:
# Basic info — df.info() prints its report directly and returns None
df.info()

# A notebook cell only renders the value of its LAST expression, so every
# intermediate result below is passed to display() (made available by the
# IPython kernel) — otherwise it would be computed and silently discarded.

# Preview data
display(df.head())

# Summary statistics for numeric and non-numeric columns alike
display(df.describe(include='all'))

# Check for missing values (null count per column)
display(df.isnull().sum())

# Check for duplicates (number of fully duplicated rows); rendered as the
# cell's final expression
df.duplicated().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity

0

## 📈 Exploratory Data Analysis
# Step 3: Exploratory Data Analysis (EDA) — Method 1: basic cleaning, date features, one-hot encoding, random forest

In [11]:
# METHOD 1
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Method 1 pipeline: drop identifier columns, derive date features, trim Profit
# outliers, one-hot encode the remaining categoricals, then fit and score a
# random forest on an 80/20 split.

# 1) Load & basic drop
# The CSV is re-read here, so this cell starts from the raw file rather than
# from whatever state `df` was left in by earlier cells.
df = pd.read_csv('/kaggle/input/superstore1/Sample - Superstore.csv', encoding='latin1')
df1 = df.drop(columns=[
    'Row ID','Order ID','Customer ID','Product ID',
    'Customer Name','Product Name','Postal Code','Country'
])

# 2) Date -> numeric
df1['Order Date'] = pd.to_datetime(df1['Order Date'])
df1['Ship Date']  = pd.to_datetime(df1['Ship Date'])
df1['Order Month'] = df1['Order Date'].dt.month
df1['Ship Delay']  = (df1['Ship Date'] - df1['Order Date']).dt.days
df1 = df1.drop(columns=['Order Date','Ship Date'])

# 3) Trim Profit outliers (IQR): keep rows within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# NOTE(review): rows are filtered on Profit — the column the target is derived
# from — before the train/test split, so this method is evaluated on a
# different, smaller set of rows than a pipeline that skips the trim. Confirm
# this is intended before comparing scores across methods.
Q1, Q3 = df1['Profit'].quantile([0.25,0.75])
IQR   = Q3 - Q1
# .copy() detaches the filtered rows from the original frame so the column
# assignment in step 4 writes to an independent DataFrame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
df1 = df1[(df1['Profit'] >= Q1 - 1.5*IQR) & (df1['Profit'] <= Q3 + 1.5*IQR)].copy()

# 4) Target & one-hot encode
# Binary target: 1 when Profit is strictly positive, else 0.
df1['Profitable'] = (df1['Profit'] > 0).astype(int)
# Profit is removed from the features because the target is derived from it.
X1 = pd.get_dummies(df1.drop(['Profit','Profitable'], axis=1), drop_first=True)
y1 = df1['Profitable']

# 5) Train/Test split & model (fixed seeds for both the split and the forest)
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=42
)
clf1 = RandomForestClassifier(random_state=42)
clf1.fit(X1_train, y1_train)
y1_pred = clf1.predict(X1_test)

# 6) Print Method 1 metrics
print("=== Method 1: Basic Clean + Date Features ===")
print("Accuracy :", accuracy_score(y1_test, y1_pred))
print("Precision:", precision_score(y1_test, y1_pred))
print("Recall   :", recall_score(y1_test, y1_pred))
print("F1 Score :", f1_score(y1_test, y1_pred))
print("Confusion Matrix:\n", confusion_matrix(y1_test, y1_pred))

=== Method 1: Basic Clean + Date Features ===
Accuracy : 0.9297597042513863
Precision: 0.9337094499294781
Recall   : 0.9851190476190477
F1 Score : 0.9587255611875451
Confusion Matrix:
 [[ 185   94]
 [  20 1324]]


## 🧹 Data Cleaning & Preprocessing
# Step 4: Data Cleaning and Preprocessing — Method 2: label encoding, date features, random forest

In [15]:
# METHOD 2 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# 1) Re-read the raw CSV and branch off a working copy for this method
df = pd.read_csv('/kaggle/input/superstore1/Sample - Superstore.csv', encoding='latin1')
df2 = df.copy()

# 2) Integer-encode the core categoricals (the single encoder is re-fit per column)
le = LabelEncoder()
for col in ('Ship Mode', 'Segment', 'Region', 'Category', 'Sub-Category'):
    df2[col] = le.fit_transform(df2[col])

# 3) Derive numeric features from the two date columns; the raw date columns
#    are dropped in the next step, so the parsed values are kept in locals
order_dates = pd.to_datetime(df2['Order Date'])
ship_dates = pd.to_datetime(df2['Ship Date'])
df2['Order Month'] = order_dates.dt.month
df2['Ship Delay'] = (ship_dates - order_dates).dt.days

# 4) Remove identifiers, raw dates and Profit (the target is derived from it)
drop_cols = [
    'Row ID', 'Order ID', 'Customer ID', 'Product ID',
    'Customer Name', 'Product Name', 'Postal Code', 'Country',
]
drop_cols += ['Order Date', 'Ship Date', 'Profit']
df2 = df2.drop(columns=drop_cols)

# 5) Binary target taken from the untouched frame: 1 when Profit > 0, else 0
df2['Profitable'] = (df['Profit'] > 0).astype(int)
y2 = df2['Profitable']

# One-hot encode whatever categorical columns are still present in the features
X2 = pd.get_dummies(df2.drop('Profitable', axis=1), drop_first=True)

# 6) 80/20 split and a seeded random forest
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)
clf2 = RandomForestClassifier(random_state=42)
clf2.fit(X2_train, y2_train)
y2_pred = clf2.predict(X2_test)

# 7) Report Method 2 metrics
print("=== Method 2: Label Encode + Feature Eng. (fixed) ===")
for label, scorer in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
):
    print(label, scorer(y2_test, y2_pred))
print("Confusion Matrix:\n", confusion_matrix(y2_test, y2_pred))


=== Method 2: Label Encode + Feature Eng. (fixed) ===
Accuracy : 0.9449724862431216
Precision: 0.9468085106382979
Recall   : 0.9876695437731196
F1 Score : 0.9668074834037416
Confusion Matrix:
 [[ 287   90]
 [  20 1602]]


## 🤖 Modeling
# Step 5: Model Selection and Training — side-by-side comparison of Method 1 and Method 2 metrics

In [16]:
# Evaluation Block

import pandas as pd

# Each run is (display label, true labels, predicted labels); the scorer map
# fixes the column order of the comparison table.
runs = [
    ('Basic Clean + Date Features', y1_test, y1_pred),
    ('Label Encode + Feature Eng.', y2_test, y2_pred),
]
scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

# One list per column: the method labels first, then one score per run
metrics = {
    'Method': [label for label, _, _ in runs],
    **{
        column: [score(y_true, y_hat) for _, y_true, y_hat in runs]
        for column, score in scorers.items()
    },
}

# Assemble the table and print it under its banner
comparison_df = pd.DataFrame(metrics)
print("=== Model Comparison ===")
print(comparison_df)


=== Model Comparison ===
                        Method  Accuracy  Precision    Recall  F1 Score
0  Basic Clean + Date Features  0.929760   0.933709  0.985119  0.958726
1  Label Encode + Feature Eng.  0.944972   0.946809  0.987670  0.966807


## 📉 Evaluation
# Step 6: Model Evaluation

## ✅ Conclusion
# Step 7: Conclusion and Future Work