<!-- @format -->

# First Assignment - IMT2022043

# Regression Challenge


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

<!-- @format -->

## Read data


In [None]:
# Load the training set from train.csv (relative to the working directory).
train_df = pd.read_csv("train.csv")
# Bare expression so the notebook renders the frame as the cell output.
train_df

## Preprocessing

<!-- @format -->

### 1. Remove Duplicates


In [None]:
# (rows, columns) before duplicates are dropped, for comparison with the next cell.
train_df.shape

In [None]:
# Drop fully duplicated rows in place, then show the new (rows, columns).
train_df.drop_duplicates(inplace=True)
train_df.shape

<!-- @format -->

### 2. Deal with NULL values


In [None]:
train_df.isna().sum()
# isna().sum() gives the number of missing values in each column
# (isnull() is an alias of isna(), so isnull().sum() is equivalent).

In [None]:
# Percentage of missing entries per column, relative to the number of rows.
missing_per_column = train_df.isna().sum()
null_value_percentages = missing_per_column.div(len(train_df)).mul(100)
null_value_percentages

<!-- @format -->

labels -> rows , columns -> columns


<!-- @format -->

### Exploratory Data Analysis(EDA)


<!-- @format -->

We do EDA here so that the values we impute are not thrown off by a few extreme outliers.


In [None]:
# Summary statistics per column, used to compare means against quartiles.
train_df.describe()

<!-- @format -->

Now I will analyse the data to look for potential outliers, but first I convert Feature2 into a numerical column.


In [None]:
# Encode the boolean Feature2 column as 1 (True) / 0 (False) in a single
# vectorized pass. The previous version called a whole-column replace() once
# per row, which is quadratic in the number of rows; this is the same mapping
# that is applied to the test set later in the notebook.
# NOTE(review): values other than True/False (e.g. NaN) are left untouched
# here, whereas the old loop turned every non-True value into 0. The null
# analysis above reports missing values only for Feature1 and Feature4, so
# this should not matter for this dataset -- confirm if the data changes.
train_df["Feature2"] = train_df["Feature2"].replace({True: 1, False: 0})
train_df

<!-- @format -->

Now we will create boxplot for finding outliers


In [None]:
# Draw one box plot per column, each on its own figure, to spot outliers.
for _column_name, column_values in train_df.items():
    sns.boxplot(column_values)
    plt.show()

<!-- @format -->

Now our data is ready for imputation, since the mean and median will be similar due to outlier removal.


<!-- @format -->

#### Strategy for removing NULL values:

Because only 3.75% of Feature1 and 5.41% of Feature4 values are missing, we will impute these columns.


In [None]:
# Names of the columns that contain at least one missing value.
has_missing = null_value_percentages > 0
columns_to_impute = null_value_percentages.loc[has_missing].index
columns_to_impute

<!-- @format -->

Check once again the mean median before imputing and decide with what value to impute


In [None]:
# Same summary statistics, transposed so that each column becomes one row.
train_df.describe().T

In [None]:
# Display the frame as it stands right before imputation.
train_df

In [None]:
# Fill the gaps in every affected column with that column's own mean.
# NOTE(review): this runs before the Feature3 outlier removal further down,
# so the means used here are computed with any outliers still present.
for col in columns_to_impute:
    column_mean = train_df[col].mean()
    train_df[col] = train_df[col].fillna(column_mean)

# Verify that no missing values remain.
train_df.isna().sum()

<!-- @format -->

Looking at the output above, we see that Feature3 has a lot of outliers, so we now remove them using upper and lower whiskers.


In [None]:
class outlierremoval:
    """Blank out values of a column that fall outside IQR-based whiskers.

    The whiskers are derived once from the column passed to the constructor:

        upper_whisker = Q3 + upper_factor * IQR
        lower_whisker = Q1 - lower_factor * IQR

    where Q1/Q3 are the 0.25/0.75 quantiles and IQR = Q3 - Q1.

    Parameters
    ----------
    col :
        Object exposing ``quantile(q)`` (e.g. a pandas Series) from which the
        quartiles are computed.
    upper_factor :
        Multiplier of the IQR above Q3. Defaults to 1.75, the value this
        notebook has always used.
    lower_factor :
        Multiplier of the IQR below Q1. Defaults to 1.25, the value this
        notebook has always used.

    NOTE(review): the defaults are asymmetric, unlike the conventional
    1.5 * IQR on both sides -- confirm this is intentional.
    """

    def __init__(self, col, upper_factor=1.75, lower_factor=1.25):
        q1 = col.quantile(0.25)
        q3 = col.quantile(0.75)
        inter_quartile_range = q3 - q1
        self.upper_whisker = q3 + upper_factor * inter_quartile_range
        self.lower_whisker = q1 - lower_factor * inter_quartile_range

    def remove_outliers(self, row):
        """Return ``row`` if it lies within the whiskers (inclusive), else None.

        Despite the parameter name, ``row`` is a single scalar value; the
        method is meant to be passed to ``Series.apply``. Values outside the
        whiskers -- and NaN, for which both comparisons are false -- yield
        None so that they can be dropped afterwards. Clipping to the nearest
        whisker instead was considered by the author but is not enabled.
        """
        if self.lower_whisker <= row <= self.upper_whisker:
            return row
        return None

In [None]:
# Compute the whiskers from Feature3, blank out every value outside them,
# and re-plot the column to check the result.
feature3_values = train_df["Feature3"]
remover = outlierremoval(feature3_values)
train_df["Feature3"] = feature3_values.apply(remover.remove_outliers)
sns.boxplot(train_df["Feature3"])

In [None]:
# Remove every row that still contains a null value; after the previous cell
# this includes the rows whose Feature3 value was outside the whiskers.
train_df.dropna(inplace=True)
train_df

<!-- @format -->

## Scaling and Standardization


In [None]:
# One scatter plot per column, each on its own figure, to compare value ranges.
for _column_name, column_values in train_df.items():
    sns.scatterplot(column_values)
    plt.show()

<!-- @format -->

On observing the plots, we see that Feature2 and the other columns vary by a power of 2, so we will normalize them.
Also, let's normalize the other columns to (-1, 1).


In [None]:
from sklearn.preprocessing import StandardScaler

<!-- @format -->

Categorical variables should not be normalized, as each value corresponds to a fixed option. So, we move the categorical variables to the left and apply the loop to the non-categorical variables.


In [None]:
# Reorder the columns so that the categorical Feature2 comes first and all
# remaining columns keep their relative order.
train_df_Feature2 = train_df["Feature2"]
remaining_columns = train_df.columns.drop("Feature2")
train_df = pd.concat([train_df_Feature2, train_df[remaining_columns]], axis=1)
train_df

In [None]:
# Scatter plot of Feature3 as it looks after the outlier rows were dropped.
sns.scatterplot(train_df["Feature3"])
plt.show()

<!-- @format -->

### EDA continued


<!-- @format -->

Let's look at the histplot of every feature to understand which regression works best here.


In [None]:
# One histogram per column, each on its own figure, to inspect distributions.
for _column_name, column_values in train_df.items():
    sns.histplot(column_values)
    plt.show()

<!-- @format -->

## Making the Model itself


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline

<!-- @format -->

My roll no is IMT2022043, so random seed will be 43.


In [None]:
# Separate the regression target from the features: keep Label on its own
# and leave only the feature columns in train_df.
Label_df = train_df["Label"]
train_df = train_df.drop(columns=["Label"])

In [None]:
# Hold out 20% of the rows for validation.
# The seed is 43 (last two digits of roll number IMT2022043), as stated in
# the notebook text; the code previously passed random_state=1.
X_train, X_test, y_train, y_test = train_test_split(
    train_df, Label_df, test_size=0.2, random_state=43
)

In [None]:
# Display the training features produced by the split.
X_train

### Making an automated Pipeline

In [None]:
# Alpha range - 0 to dont know
pipe = make_pipeline(PolynomialFeatures(degree=3), StandardScaler(), Lasso(alpha=0.9))

In [None]:
# Fit the pipeline on the training split and score it on the held-out split.
Y_poly_Predict = pipe.fit(X_train, y_train).predict(X_test)
holdout_r2 = r2_score(y_test, Y_poly_Predict)
print("R2 Score: ", holdout_r2)

In [None]:
# Read test.csv
test_df = pd.read_csv("test.csv")
test_df

In [None]:
# Apply the same Feature2 encoding as for the training data (True -> 1,
# False -> 0) and discard the id column, which is not a model feature.
bool_to_int = {True: 1, False: 0}
test_df["Feature2"] = test_df["Feature2"].replace(bool_to_int)
test_df = test_df.drop(columns="id")
test_df

In [None]:
# Swap Feature 2 and Feature1
test_df_Feature2 = test_df['Feature2']
test_df = test_df.drop(['Feature2'] , axis=1)
test_df = pd.concat([test_df_Feature2 , test_df] , axis=1)
test_df

#### Predict based on the above pipeline

In [None]:
# Predict labels for the prepared test features with the fitted pipeline.
# (The pipeline ends in Lasso; the variable used to be named "..._Ridge".)
test_predictions = pipe.predict(test_df)

# Write the predictions to a csv file, with first column as id.
# NOTE(review): ids are generated as 0..n-1 in row order because the real
# `id` column of test.csv was dropped during preprocessing -- confirm that
# test.csv ids are exactly 0..n-1 in file order.
submission = pd.DataFrame(
    {
        "id": range(len(test_predictions)),
        "Label": test_predictions,
    }
)
submission.to_csv("submission_poly_lasso.csv", index=False)