### **Ridge Regularization**

**Step 1 : Import Necessary Libraries**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error,r2_score
import xlrd

**Step 2 : Load the Dataset**

In [2]:
df = pd.read_excel("E:\\Machine Learning\\global_superstore\\Global Superstore.xls")
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
0,32298,CA-2012-124891,2012-07-31,2012-07-31,Same Day,RH-19495,Rick Hansen,Consumer,New York City,New York,...,TEC-AC-10003033,Technology,Accessories,Plantronics CS510 - Over-the-Head monaural Wir...,2309.65,7,0.0,762.1845,933.57,Critical
1,26341,IN-2013-77878,2013-02-05,2013-02-07,Second Class,JR-16210,Justin Ritter,Corporate,Wollongong,New South Wales,...,FUR-CH-10003950,Furniture,Chairs,"Novimex Executive Leather Armchair, Black",3709.395,9,0.1,-288.765,923.63,Critical
2,25330,IN-2013-71249,2013-10-17,2013-10-18,First Class,CR-12730,Craig Reiter,Consumer,Brisbane,Queensland,...,TEC-PH-10004664,Technology,Phones,"Nokia Smart Phone, with Caller ID",5175.171,9,0.1,919.971,915.49,Medium
3,13524,ES-2013-1579342,2013-01-28,2013-01-30,First Class,KM-16375,Katherine Murray,Home Office,Berlin,Berlin,...,TEC-PH-10004583,Technology,Phones,"Motorola Smart Phone, Cordless",2892.51,5,0.1,-96.54,910.16,Medium
4,47221,SG-2013-4320,2013-11-05,2013-11-06,Same Day,RH-9495,Rick Hansen,Consumer,Dakar,Dakar,...,TEC-SHA-10000501,Technology,Copiers,"Sharp Wireless Fax, High-Speed",2832.96,8,0.0,311.52,903.04,Critical


In [3]:
df.shape

(51290, 24)

In [4]:
df.isnull().sum()

Row ID                0
Order ID              0
Order Date            0
Ship Date             0
Ship Mode             0
Customer ID           0
Customer Name         0
Segment               0
City                  0
State                 0
Country               0
Postal Code       41296
Market                0
Region                0
Product ID            0
Category              0
Sub-Category          0
Product Name          0
Sales                 0
Quantity              0
Discount              0
Profit                0
Shipping Cost         0
Order Priority        0
dtype: int64

**Step 3 : Data preprocessing**

In [5]:
df = df.drop(columns = ['Postal Code'], axis = 1)

**Step 4 : Correlationship between columns**

In [6]:
df_num = df.select_dtypes(include = [np.number])
df_cat = df.select_dtypes(include = ['object'])

print(df_num.columns)
print(df_cat.columns)

Index(['Row ID', 'Sales', 'Quantity', 'Discount', 'Profit', 'Shipping Cost'], dtype='object')
Index(['Order ID', 'Ship Mode', 'Customer ID', 'Customer Name', 'Segment',
       'City', 'State', 'Country', 'Market', 'Region', 'Product ID',
       'Category', 'Sub-Category', 'Product Name', 'Order Priority'],
      dtype='object')


In [8]:
df_num_corr = df_num.corr()
df_num_list = []

df_num_list.extend(df_num_corr[(df_num_corr['Profit'] > 0.3)].index.values)
df_num_list.extend(df_num_corr[(df_num_corr['Profit'] < 0.3)].index.values)

In [9]:
df_num_list

['Sales', 'Profit', 'Shipping Cost', 'Row ID', 'Quantity', 'Discount']

In [10]:
df_cat['P'] = df_num['Profit']

In [12]:
from scipy.stats import f_oneway

In [13]:
influence_list = []
noninfluence_list = []
for influence1 in list (df_cat.columns):
    if influence1 == 'P':
        continue
    else:
        groups = [df_cat['P'][df_cat[influence1] == category] for category in df_cat[influence1].unique()]
        f_stat, p_value = f_oneway(*groups)
        print(f"column : {influence1}, F-statistic: {f_stat}, P-value: {p_value}")
        if p_value < 0.05:
            influence_list.append(influence1)
        else:
            noninfluence_list.append(influence1)

column : Order ID, F-statistic: 1.0775148134190364, P-value: 1.1350594713462918e-09
column : Ship Mode, F-statistic: 0.1112694400346025, P-value: 0.953549120213493
column : Customer ID, F-statistic: 1.1374918958952611, P-value: 0.00012663690762868544
column : Customer Name, F-statistic: 1.1538436569505661, P-value: 0.0017954686365357188
column : Segment, F-statistic: 0.22119237404323483, P-value: 0.8015632303031714
column : City, F-statistic: 2.164422870379703, P-value: 1.9283854765863786e-276
column : State, F-statistic: 4.709630866402644, P-value: 0.0
column : Country, F-statistic: 22.25802770920221, P-value: 0.0
column : Market, F-statistic: 28.07818874959561, P-value: 1.0834601137634787e-33
column : Region, F-statistic: 34.26121568235655, P-value: 3.617471983600004e-80
column : Product ID, F-statistic: 4.007224887155522, P-value: 0.0
column : Category, F-statistic: 304.50613538510225, P-value: 3.4173111634965594e-132
column : Sub-Category, F-statistic: 103.91551183097653, P-value: 

In [None]:
influence_list

In [None]:
noninfluence_list

In [14]:
df_cat1 = df_cat[influence_list]
df_cat1.columns

Index(['Order ID', 'Customer ID', 'Customer Name', 'City', 'State', 'Country',
       'Market', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name'],
      dtype='object')

In [19]:
df_cat1.shape

(51290, 12)

In [17]:
df_num1 = df_num[['Sales', 'Profit', 'Shipping Cost', 'Row ID', 'Quantity', 'Discount']]
df_num1.columns

Index(['Sales', 'Profit', 'Shipping Cost', 'Row ID', 'Quantity', 'Discount'], dtype='object')

**Step 5 : Encoding Categorical columns**

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_cat2 = df_cat1.apply(le.fit_transform)
df_cat2.head()

Unnamed: 0,Order ID,Customer ID,Customer Name,City,State,Country,Market,Region,Product ID,Category,Sub-Category,Product Name
0,1499,1286,632,2290,703,139,6,6,8246,2,0,2750
1,13063,808,413,3518,702,6,0,9,907,0,5,2525
2,12983,336,181,497,820,6,0,9,10157,2,13,2502
3,6813,873,424,375,145,47,4,3,10146,2,13,2414
4,21702,1290,632,857,270,110,1,0,10249,2,6,3158


In [21]:
df_cat2.shape

(51290, 12)

**Step 6 : Outlier Removal by IQR**

In [48]:
def remove_outliers_iqr(df,columns):
    Q1 = df[columns].quantile(0.25)
    Q3 = df[columns].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df = df[~((df[columns] < lower_bound) | (df[columns] > upper_bound)).any(axis=1)]

    return df

In [50]:
columns_to_check = ['Sales', 'Profit', 'Shipping Cost', 'Row ID', 'Quantity', 'Discount']
df_no_outliers = remove_outliers_iqr(df_num1, columns_to_check)
df_no_outliers.shape

(36311, 6)

**Step 7 : Implementing Scaler on numerical columns**  

In [51]:
from sklearn.preprocessing import StandardScaler
columns_to_scale = ['Sales', 'Profit', 'Shipping Cost', 'Row ID', 'Quantity', 'Discount']
scaler = StandardScaler()
df_standard_scaled = df_no_outliers.copy()
df_standard_scaled[columns_to_scale] = scaler.fit_transform(df_no_outliers[columns_to_scale])


**Step 8 :Splitting the data in to training and testing sets**

In [52]:
final_df = pd.concat([df_standard_scaled,df_cat2], axis = 1)
final_df.head()

Unnamed: 0,Sales,Profit,Shipping Cost,Row ID,Quantity,Discount,Order ID,Customer ID,Customer Name,City,State,Country,Market,Region,Product ID,Category,Sub-Category,Product Name
5909,2.723697,1.15499,4.454636,0.480309,-0.056329,0.729762,3092,1107,576,859,982,139,6,3,9968,2,13,2509
5914,0.621737,0.282391,4.44716,-0.040449,2.075938,0.056412,10041,753,354,26,924,6,0,9,1285,0,9,2885
5915,3.144695,2.360535,4.44716,0.578931,-0.589396,-0.616938,3686,1517,741,904,646,139,6,3,2174,0,16,596
5921,1.572931,1.115667,4.442487,-0.041539,-1.122463,-0.616938,13400,1259,663,3484,1086,26,0,8,8003,2,0,3044
5922,3.520546,-1.881328,4.440618,0.415266,1.542871,2.076463,10640,37,19,217,96,91,0,9,216,0,4,800


**Step 9 : Implementing Ridge regularization**

In [56]:
from sklearn.linear_model import Ridge

In [55]:
# Drop rows with any missing values
df_clean = final_df.dropna()

# Separate features and target again
X = df_clean.drop(columns=['Profit'])
y = df_clean['Profit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Step 10 : Train the model**

In [58]:
# Step 4: Train the Ridge regression model
ridge_reg = Ridge()
ridge_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = ridge_reg.predict(X_test)

r2 = r2_score(y_test, y_pred)

In [59]:
r2

0.3976489924791492