### **Linear Regression**

* Linear Regression is one of the most fundamental and widely used algorithms for regression tasks.
*  It is a supervised learning algorithm that models the relationship between a dependent (target) variable and one or more independent (predictor) variables by fitting a linear equation to observed data.

**Step 1 : Import Necessary Libraries**

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import xlrd

**Step 2 : Load the Dataset**

In [2]:
# Load the Global Superstore workbook into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook only
# runs where this exact file exists — consider a path relative to a configurable data directory.
df = pd.read_excel("E:\\Machine Learning\\global_superstore\\Global Superstore.xls")

In [3]:
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,City,State,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Shipping Cost,Order Priority
0,32298,CA-2012-124891,2012-07-31,2012-07-31,Same Day,RH-19495,Rick Hansen,Consumer,New York City,New York,...,TEC-AC-10003033,Technology,Accessories,Plantronics CS510 - Over-the-Head monaural Wir...,2309.65,7,0.0,762.1845,933.57,Critical
1,26341,IN-2013-77878,2013-02-05,2013-02-07,Second Class,JR-16210,Justin Ritter,Corporate,Wollongong,New South Wales,...,FUR-CH-10003950,Furniture,Chairs,"Novimex Executive Leather Armchair, Black",3709.395,9,0.1,-288.765,923.63,Critical
2,25330,IN-2013-71249,2013-10-17,2013-10-18,First Class,CR-12730,Craig Reiter,Consumer,Brisbane,Queensland,...,TEC-PH-10004664,Technology,Phones,"Nokia Smart Phone, with Caller ID",5175.171,9,0.1,919.971,915.49,Medium
3,13524,ES-2013-1579342,2013-01-28,2013-01-30,First Class,KM-16375,Katherine Murray,Home Office,Berlin,Berlin,...,TEC-PH-10004583,Technology,Phones,"Motorola Smart Phone, Cordless",2892.51,5,0.1,-96.54,910.16,Medium
4,47221,SG-2013-4320,2013-11-05,2013-11-06,Same Day,RH-9495,Rick Hansen,Consumer,Dakar,Dakar,...,TEC-SHA-10000501,Technology,Copiers,"Sharp Wireless Fax, High-Speed",2832.96,8,0.0,311.52,903.04,Critical


In [4]:
df.shape

(51290, 24)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51290 entries, 0 to 51289
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Row ID          51290 non-null  int64         
 1   Order ID        51290 non-null  object        
 2   Order Date      51290 non-null  datetime64[ns]
 3   Ship Date       51290 non-null  datetime64[ns]
 4   Ship Mode       51290 non-null  object        
 5   Customer ID     51290 non-null  object        
 6   Customer Name   51290 non-null  object        
 7   Segment         51290 non-null  object        
 8   City            51290 non-null  object        
 9   State           51290 non-null  object        
 10  Country         51290 non-null  object        
 11  Postal Code     9994 non-null   float64       
 12  Market          51290 non-null  object        
 13  Region          51290 non-null  object        
 14  Product ID      51290 non-null  object        
 15  Ca

**Step 3 : Preprocess the Dataset**

In [6]:
df.isnull().sum()

Row ID                0
Order ID              0
Order Date            0
Ship Date             0
Ship Mode             0
Customer ID           0
Customer Name         0
Segment               0
City                  0
State                 0
Country               0
Postal Code       41296
Market                0
Region                0
Product ID            0
Category              0
Sub-Category          0
Product Name          0
Sales                 0
Quantity              0
Discount              0
Profit                0
Shipping Cost         0
Order Priority        0
dtype: int64

In [7]:
# 'Postal Code' is missing for 41,296 of the 51,290 rows, so the column is dropped
# rather than imputed.
# Rebinding the result (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['Postal Code'], errors='ignore')

In [8]:
df.isnull().sum()

Row ID            0
Order ID          0
Order Date        0
Ship Date         0
Ship Mode         0
Customer ID       0
Customer Name     0
Segment           0
City              0
State             0
Country           0
Market            0
Region            0
Product ID        0
Category          0
Sub-Category      0
Product Name      0
Sales             0
Quantity          0
Discount          0
Profit            0
Shipping Cost     0
Order Priority    0
dtype: int64

In [9]:
# Split the frame by dtype: numeric columns feed the correlation / scaling steps,
# object (string) columns feed the ANOVA / encoding steps.
df_num = df.select_dtypes(include="number")
df_cat = df.select_dtypes(include="object")

In [10]:
print(df_num.columns)
print("-----------------------------------------------------")
print(df_cat.columns)

Index(['Row ID', 'Sales', 'Quantity', 'Discount', 'Profit', 'Shipping Cost'], dtype='object')
-----------------------------------------------------
Index(['Order ID', 'Ship Mode', 'Customer ID', 'Customer Name', 'Segment',
       'City', 'State', 'Country', 'Market', 'Region', 'Product ID',
       'Category', 'Sub-Category', 'Product Name', 'Order Priority'],
      dtype='object')


**Step 4 : Correlation between the columns**

In [11]:
df_num_corr = df_num.corr()

In [12]:
# Collect the numeric columns whose correlation with Profit is stronger than +/-0.3:
# positively correlated columns first, then negatively correlated ones.
# Profit itself passes the positive filter because its self-correlation is 1.
profit_corr = df_num_corr["Profit"]
df_num_columns = (
    profit_corr[profit_corr > 0.3].index.tolist()
    + profit_corr[profit_corr < -0.3].index.tolist()
)

In [13]:
df_num_columns

['Sales', 'Profit', 'Shipping Cost', 'Discount']

In [14]:
from scipy.stats import f_oneway

In [15]:
# Attach the Profit values to the categorical frame under the short name 'P' so the
# ANOVA cells below can slice Profit by category directly from df_cat.
# NOTE(review): this mutates df_cat in place; 'P' has to be skipped wherever df_cat's
# columns are iterated as categorical features.
df_cat['P'] = df_num['Profit']

In [16]:
# One-way ANOVA: does mean Profit ('P') differ between the Market categories?
# Build one Profit series per distinct Market value, then compare the groups.
groups = [
    df_cat.loc[df_cat['Market'] == market_name, 'P']
    for market_name in df_cat['Market'].unique()
]

f_stat, p_value = f_oneway(*groups)
print(f"F-statistic : {f_stat}, p-value : {p_value}")

F-statistic : 28.07818874959561, p-value : 1.0834601137634787e-33


In [17]:
# One-way ANOVA of Profit ('P') against every categorical column.
# Columns whose category groups differ significantly (p < 0.05) are collected in
# influence_list; all others go to noninfluence_list.
influence_list = []
noninfluence_list = []
for influence1 in df_cat.columns:
    if influence1 == 'P':
        # 'P' is the Profit copy attached to df_cat, not a categorical feature.
        continue

    # A single groupby pass yields one Profit series per category, in order of first
    # appearance (sort=False) — the same groups, in the same order, as masking the frame
    # once per unique value, without the O(rows x categories) scan that is very slow for
    # ID-like columns with thousands of distinct values.
    # Note: groupby skips NaN keys; the categorical columns contain no nulls at this point.
    groups = [profit for _, profit in df_cat.groupby(influence1, sort=False)['P']]
    f_stat, p_value = f_oneway(*groups)
    print(f"column : {influence1}, F-statistic: {f_stat}, P-value: {p_value}")

    if p_value < 0.05:
        influence_list.append(influence1)
    else:
        noninfluence_list.append(influence1)

column : Order ID, F-statistic: 1.0775148134190364, P-value: 1.1350594713462918e-09
column : Ship Mode, F-statistic: 0.1112694400346025, P-value: 0.953549120213493
column : Customer ID, F-statistic: 1.1374918958952611, P-value: 0.00012663690762868544
column : Customer Name, F-statistic: 1.1538436569505661, P-value: 0.0017954686365357188
column : Segment, F-statistic: 0.22119237404323483, P-value: 0.8015632303031714
column : City, F-statistic: 2.164422870379703, P-value: 1.9283854765863786e-276
column : State, F-statistic: 4.709630866402644, P-value: 0.0
column : Country, F-statistic: 22.25802770920221, P-value: 0.0
column : Market, F-statistic: 28.07818874959561, P-value: 1.0834601137634787e-33
column : Region, F-statistic: 34.26121568235655, P-value: 3.617471983600004e-80
column : Product ID, F-statistic: 4.007224887155522, P-value: 0.0
column : Category, F-statistic: 304.50613538510225, P-value: 3.4173111634965594e-132
column : Sub-Category, F-statistic: 103.91551183097653, P-value: 

In [18]:
influence_list

['Order ID',
 'Customer ID',
 'Customer Name',
 'City',
 'State',
 'Country',
 'Market',
 'Region',
 'Product ID',
 'Category',
 'Sub-Category',
 'Product Name']

In [19]:
noninfluence_list

['Ship Mode', 'Segment', 'Order Priority']

In [20]:
df_cat1 = df_cat[influence_list]

In [21]:
df_cat1.columns

Index(['Order ID', 'Customer ID', 'Customer Name', 'City', 'State', 'Country',
       'Market', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name'],
      dtype='object')

**Step 5 : Encoding the categorical columns**

In [22]:
from sklearn.preprocessing import LabelEncoder
# Replace every categorical value with an integer code, column by column.
# NOTE(review): the same encoder instance is re-fitted for each column by apply(), so
# after this cell `le` presumably only remembers the classes of the last column and
# cannot be used to decode the others — confirm before relying on inverse_transform.
le = LabelEncoder()
df_cat_e = df_cat1.apply(le.fit_transform)
# Preview the encoded frame.
df_cat_e.head()

Unnamed: 0,Order ID,Customer ID,Customer Name,City,State,Country,Market,Region,Product ID,Category,Sub-Category,Product Name
0,1499,1286,632,2290,703,139,6,6,8246,2,0,2750
1,13063,808,413,3518,702,6,0,9,907,0,5,2525
2,12983,336,181,497,820,6,0,9,10157,2,13,2502
3,6813,873,424,375,145,47,4,3,10146,2,13,2414
4,21702,1290,632,857,270,110,1,0,10249,2,6,3158


**Step 6 : Removing Outliers from the Data Columns**

In [23]:
def remove_outliers_iqr(df,columns):
    """Drop rows that fall outside the 1.5 * IQR fences of the given columns.

    The columns are processed one after another, and each column's quartiles are
    computed on the rows that survived the previous columns' filters. Values that
    lie exactly on a fence are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    columns : iterable of str
        Names of the numeric columns to check, in processing order.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with their original index labels. When ``columns`` is
        empty, ``df`` itself is returned.
    """
    for name in columns:
        first_quartile = df[name].quantile(0.25)
        third_quartile = df[name].quantile(0.75)
        spread = third_quartile - first_quartile

        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread

        # between() is inclusive on both ends, matching a >= / <= comparison.
        df = df[df[name].between(low_fence, high_fence)]
    return df

In [24]:
df_num.columns

Index(['Row ID', 'Sales', 'Quantity', 'Discount', 'Profit', 'Shipping Cost'], dtype='object')

In [25]:
# Run the IQR filter over every numeric column, one after another.
columns_to_check = [
    'Row ID', 'Sales', 'Quantity',
    'Discount', 'Profit', 'Shipping Cost',
]
df_no_outliers = remove_outliers_iqr(df_num, columns_to_check)

# Report the frame shape before and after the outlier treatment.
print(
    "Original DataFrame",
    df_num.shape,
    "DataFrame after Outlier Treatment",
    df_no_outliers.shape,
    sep="\n",
)

Original DataFrame
(51290, 6)
DataFrame after Outlier Treatment
(30879, 6)


**Step 7 : Scale the numerical columns**

In [26]:
from sklearn.preprocessing import StandardScaler

In [27]:
# Standardise every numeric column of the outlier-free frame.
columns_to_scale = [
    'Row ID', 'Sales', 'Quantity',
    'Discount', 'Profit', 'Shipping Cost',
]
standard_scaler = StandardScaler()

# Work on a copy so df_no_outliers keeps the unscaled values.
df_standard_scaled = df_no_outliers.copy()
scaled_values = standard_scaler.fit_transform(df_no_outliers[columns_to_scale])
df_standard_scaled[columns_to_scale] = scaled_values

In [28]:
# Combine the scaled numeric columns with the encoded categorical columns.
# df_standard_scaled only holds the rows that survived outlier removal, while df_cat_e
# still holds every original row. join="inner" keeps just the rows present in both,
# instead of the default outer join that pads the removed rows with NaN numerics.
final_df = pd.concat([df_standard_scaled, df_cat_e], axis=1, join="inner")

In [29]:
final_df.head()

Unnamed: 0,Row ID,Sales,Quantity,Discount,Profit,Shipping Cost,Order ID,Customer ID,Customer Name,City,State,Country,Market,Region,Product ID,Category,Sub-Category,Product Name
12068,0.55465,1.208948,-0.530879,0.734589,1.419094,3.27575,1918,1364,719,1400,982,139,6,3,8087,2,0,2290
12069,0.759078,1.760251,0.718684,1.406908,-2.778887,3.27575,1662,1535,757,3508,273,139,6,6,1982,0,16,2218
12073,-0.971982,2.699341,0.718684,-0.61005,-0.193081,3.272457,7802,534,248,3431,129,119,4,10,2742,1,1,1908
12074,0.351571,1.517718,0.718684,2.079228,-0.129688,3.272457,14425,591,304,2119,1033,6,0,9,10112,2,13,2485
12078,0.426671,0.800576,1.968248,0.734589,1.880483,3.269164,539,1259,663,772,742,139,6,6,4445,1,7,3471


**Step 8 : Split the data and train the model**

In [41]:
# These names are already imported in Step 1; they are repeated here only so the cell
# can be run on its own.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Drop rows with any missing values
df_clean = final_df.dropna()

# Separate features and target
X = df_clean.drop(columns=['Profit', 'Product Name'])
y = df_clean['Profit']

# Split the data into training and testing sets (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Score the predictions. `r2` was previously printed without ever being assigned, which
# raises NameError on a fresh kernel (Restart & Run All).
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")


R-squared: 0.4043123623513718
