# Importing Data

In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Kaggle dataset handle and the file inside that dataset to load.
dataset_handle = "litonislam/amazon-sales-analysis-cleaned-data"
file_path = "cleaned_dataset.csv"

# Load the latest version through the pandas adapter.
# Additional arguments such as sql_query or pandas_kwargs may be supplied;
# see the documentation for more information:
# https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
# NOTE(review): newer kagglehub releases appear to expose this call as
# `dataset_load` -- confirm against the installed version.
df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    dataset_handle,
    file_path,
)

df.head()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Data Analysis and Cleaning

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Descriptive summary statistics of the loaded frame.
df.describe()

In [None]:
# Number of columns per dtype.
df.dtypes.value_counts()

In [None]:
# Missing-value count for each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Work on a copy so the loaded frame `df` is not modified by the steps below.
data = df.copy()

In [None]:
# Find the object-dtype columns on the source frame `df` rather than on
# `data`: this cell overwrites `data`, so deriving the mask from `data` would
# leave `cat_data` empty on a re-run and break the later cell that selects
# `df[cat_data]`.
col = df.dtypes == object
cat_data = df.columns[col]

# errors="ignore" turns the drop into a no-op when the columns are already
# gone, so the cell can be executed more than once.
data = data.drop(columns=cat_data, errors="ignore")
data.head(10)

In [None]:
# Remove the two ID columns from the working frame.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns have already been dropped.
data = data.drop(columns=['order_id', 'product_id'], errors='ignore')


In [None]:
# Pairwise correlation coefficients between the columns left in `data`.
corr_mat = data.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_mat,
    ax=ax,
    annot=True,        # print each coefficient inside its cell
    fmt='.2f',         # two decimal places
    cmap='coolwarm',   # diverging colour map
    center=0,          # colour map is centred on zero
    linewidths=0.5,    # separator lines between cells
)
ax.set_title('Correlation Matrix')

In [None]:
# Grid of pairwise relationship plots for the columns of `data`.
sns.pairplot(data)

## Data Transformation

In [None]:
# Overlay the two distributions on one explicit axes. Labels and a legend are
# needed because both histograms share the plot and would otherwise be
# indistinguishable apart from colour.
fig, ax = plt.subplots(figsize=(10, 8))
data['total_revenue'].hist(ax=ax, alpha=0.5, color='green', label='total_revenue')
data['profit'].hist(ax=ax, alpha=0.5, color='purple', label='profit')
ax.set_title('Distribution of total_revenue and profit')
ax.set_xlabel('Value')
ax.set_ylabel('Count')
ax.legend()
plt.show()

In [None]:
from scipy.stats.mstats import normaltest  # D'Agostino K^2 test

# Run the normality test on the revenue and profit columns and print the
# test statistic and p-value for each.
undist_data = data[['total_revenue', 'profit']]
for col, values in undist_data.items():
    k2, p_val = normaltest(values)
    print(f"Column: {col}")
    print(f"  Statistic: {k2:.4f}, P-value: {p_val:.4f}")

In [None]:
from scipy.stats import boxcox

# Box-Cox transform of the revenue column.
# The raw values are read from `df`, not `data`: a later cell overwrites
# data['total_revenue'] with this result, so transforming `data` here would
# transform already-transformed values when the cell is re-run.
# The result is indexed below as a tuple whose first element holds the
# transformed values. Box-Cox is only defined for strictly positive input.
tot_rev_dis = boxcox(df['total_revenue'])

fig, ax = plt.subplots()
ax.hist(tot_rev_dis[0])
ax.set_title('Box-Cox transformed total_revenue')
ax.set_xlabel('Transformed value')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Repeat the normality test on the Box-Cox-transformed revenue values.
normaltest(tot_rev_dis[0])

In [None]:
# Skewness of the Box-Cox-transformed revenue values.
pd.DataFrame(tot_rev_dis[0]).skew()

In [None]:
# Box-Cox transform of the profit column.
# The raw values are read from `df`, not `data`: a later cell overwrites
# data['profit'] with this result, so transforming `data` here would
# transform already-transformed values when the cell is re-run.
# NOTE(review): Box-Cox is only defined for strictly positive input -- confirm
# that profit contains no zero or negative values.
tot_prof_dis = boxcox(df['profit'])

fig, ax = plt.subplots()
ax.hist(tot_prof_dis[0])
ax.set_title('Box-Cox transformed profit')
ax.set_xlabel('Transformed value')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Replace both columns with their Box-Cox-transformed values (the first
# element of each boxcox result), then display the frame.
data = data.assign(
    total_revenue=tot_rev_dis[0],
    profit=tot_prof_dis[0],
)
data

In [None]:
# Object-dtype columns taken from the original frame, without 'order_date'.
obj_data = df[cat_data].drop(columns=['order_date'])
obj_data.head(10)

## Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

le = LabelEncoder()
ohc = OneHotEncoder()

# One block of indicator columns is produced per source column. The blocks are
# collected and concatenated once after the loop, instead of dropping a column
# and re-concatenating the whole frame on every iteration. The leftover debug
# print of each full column has been removed.
encoded_blocks = []
for col in obj_data.columns:
    # Labels -> integer codes -> one indicator column per code.
    dat = le.fit_transform(obj_data[col]).astype(int)
    new_dat = ohc.fit_transform(dat.reshape(-1, 1))

    # Create unique column names of the form "<source column>-<position>".
    n_cols = new_dat.shape[1]
    col_names = ['-'.join([col, str(x)]) for x in range(n_cols)]

    # Dense frame for this column, aligned to the source index.
    new_df = pd.DataFrame(
        new_dat.toarray(),
        index=obj_data.index,
        columns=col_names
    )
    encoded_blocks.append(new_df)

# pd.concat rejects an empty list, so fall back to a column-less frame that
# still carries the source index when there is nothing to encode.
if encoded_blocks:
    obj_data_ohc = pd.concat(encoded_blocks, axis=1)
else:
    obj_data_ohc = pd.DataFrame(index=obj_data.index)

In [None]:
# Display the one-hot encoded frame built in the previous cell.
obj_data_ohc

In [None]:
# Place the encoded categorical columns and the numeric columns side by side
# (aligned on the index).
arp_df = pd.concat([obj_data_ohc, data], axis="columns")
arp_df.head(10)

# Prediction Set-up

In [None]:
from sklearn.model_selection import KFold, cross_val_predict, train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline


# Seed shared by the split and the cross-validation folds.
rs = 42

# Target is the 'total_revenue' column; every other column is a feature.
# NOTE(review): the features still include 'profit', which may be derived
# from revenue -- confirm this is not target leakage.
y = arp_df['total_revenue']
X = arp_df.drop(columns=['total_revenue'])

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=rs
)

In [None]:
# Fit a plain linear regression on the training split.
lr = LinearRegression().fit(X_train, y_train)

y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# R^2 on both splits, stored as a one-element list holding a named Series.
split_scores = pd.Series(
    {
        'train': r2_score(y_train, y_train_pred),
        'test': r2_score(y_test, y_test_pred),
    },
    name='predictions',
)
r2_score_df = [split_scores]
r2_score_df


In [None]:
# Five shuffled folds, seeded with `rs`.
kf = KFold(n_splits=5, shuffle=True, random_state=rs)

In [None]:
# Cross-validated predictions from the linear model over the folds in `kf`,
# scored with R^2 against the target.
predictions = cross_val_predict(estimator=lr, X=X, y=y, cv=kf)
r2_score(y, predictions)

In [None]:
# Candidate regularisation strengths: 50 geometrically spaced values.
# NOTE(review): the lower bound 1e-20 is effectively zero regularisation --
# confirm this range is intended.
alphas = np.geomspace(1e-20, 5, 50)
scores, coefs = [], []  # `coefs` is never filled in this cell

for alpha in alphas:
    las = Lasso(alpha=alpha, max_iter=100000)

    # NOTE(review): the pipeline holds only the model, so features reach the
    # regularised regression unscaled -- confirm that is intended.
    estimator = Pipeline(steps=[('lasso_regression', las)])

    # R^2 of the cross-validated predictions for this alpha.
    predictions = cross_val_predict(estimator, X, y, cv=kf)
    score = r2_score(y, predictions)
    scores.append(score)

In [None]:
# Pair every alpha with its cross-validated R^2 score.
list(zip(alphas,scores))

In [None]:
# R^2 versus alpha, with alpha on a logarithmic x-axis.
fig, ax = plt.subplots(figsize=(10, 8))
ax.semilogx(alphas, scores, '-o')
ax.set_xlabel('$\\alpha$')
ax.set_ylabel('$R^2$')

In [None]:
# Candidate regularisation strengths: 50 geometrically spaced values from 3 to 20.
alphas = np.geomspace(3, 20, 50)
scores, coefs = [], []  # `coefs` is never filled in this cell

for alpha in alphas:
    ridge = Ridge(alpha=alpha, max_iter=100000)

    # NOTE(review): the pipeline holds only the model, so features reach the
    # regularised regression unscaled -- confirm that is intended.
    estimator = Pipeline(steps=[('ridge_regression', ridge)])

    # R^2 of the cross-validated predictions for this alpha.
    predictions = cross_val_predict(estimator, X, y, cv=kf)
    score = r2_score(y, predictions)
    scores.append(score)