# AI-511 Machine Learning Project 2022-23

## Imports for the Notebook

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

import nltk
from nltk.corpus import stopwords

import re
import string
import time
import random
from random import uniform

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection._univariate_selection import SelectKBest, f_regression
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
import xgboost as xgb
from sklearn.svm import SVR
import lightgbm as LGBM
from sklearn.neural_network import MLPRegressor


from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from scipy.sparse import vstack
from scipy.stats import randint as sp_randint



# English stop words held as a set; used for membership tests when filtering tokens
STOPWORDS = set(stopwords.words('english'))

## Load the Dataset

In [None]:
# Load the training CSV from the Kaggle input directory and preview the first rows
train_df = pd.read_csv("/kaggle/input/snapit-always-get-the-best-price/train.csv")
train_df.head()

In [None]:
# Load the test CSV from the same Kaggle input directory and preview the first rows
test_df = pd.read_csv("/kaggle/input/snapit-always-get-the-best-price/test.csv")
test_df.head()

## 1. Preprocessing the Data

### 1.1 Partitioning `CATEGORY` column

We can observe that a data point's `CATEGORY` is of the form `CATEGORY_1/CATEGORY_2/.../CATEGORY_5`. For further application of models, we divide the feature into 5 sub-divisions.

In [None]:
def divideCategory(df):
    """Split the "/"-separated CATEGORY column into CATEGORY_1..CATEGORY_5, in place.

    Parameters:
        df: DataFrame with a string column named CATEGORY.

    Returns:
        None. Five new columns are added to `df`. Levels that a row does not
        have are left null; anything beyond the fifth level stays, unsplit,
        in CATEGORY_5.
    """
    level_names = ['CATEGORY_1', 'CATEGORY_2', 'CATEGORY_3', 'CATEGORY_4', 'CATEGORY_5']
    # Cap the number of splits so that at most five parts are ever produced ...
    parts = df["CATEGORY"].str.split("/", n=len(level_names) - 1, expand=True)
    # ... and pad with empty columns when the deepest row has fewer than five
    # levels, so the assignment below always receives exactly five columns.
    parts = parts.reindex(columns=range(len(level_names)))
    parts.columns = level_names
    df[level_names] = parts

def dropCol(df, col):
    """Remove the column (or list of columns) `col` from `df` in place; returns None."""
    df.drop(columns=col, inplace=True)

In [None]:
# Expand CATEGORY into CATEGORY_1..CATEGORY_5 on the training data, then drop the combined column
divideCategory(train_df)
dropCol(train_df, "CATEGORY")
train_df.head()

In [None]:
# Same CATEGORY expansion for the test data, so both frames share the same columns
divideCategory(test_df)
dropCol(test_df, "CATEGORY")
test_df.head()

### 1.2 Neutralise the Null values

NULL values occur whenever no value is given for a particular feature of a data point. We must either eliminate or replace these null values.

In [None]:
# null values of each column (number of missing entries per column of the training data)
train_df.isna().sum()

In [None]:
def replaceNulls(df, col, const):
    """Replace every null in column `col` of `df` with `const`, in place.

    Parameters:
        df: DataFrame to modify.
        col: name of the column whose nulls are filled.
        const: value written wherever the column is null.

    Returns:
        None; `df` is updated.
    """
    # Assign the filled column back to the frame. Calling fillna(inplace=True)
    # on the result of df[col] mutates an intermediate object, which is not
    # guaranteed to propagate to df (pandas copy-on-write semantics).
    df[col] = df[col].fillna(const)

In [None]:
# Columns whose nulls are replaced by the placeholder string "none"
null_valued_columns = ["PRODUCT_BRAND", "PRODUCT_DESCRIPTION", "CATEGORY_1", "CATEGORY_2", "CATEGORY_3", "CATEGORY_4", "CATEGORY_5"]
for column in null_valued_columns:
    replaceNulls(train_df, column, "none")

train_df.head()

In [None]:
# Re-count nulls per column after the fill
train_df.isna().sum()

As we can see, there are no more NULLs left. The process is the same for the test data.

In [None]:
# Count nulls per column of the test data
test_df.isna().sum()

Replace the null values

In [None]:
# they are the same as in train_df
# (reuses null_valued_columns and the same "none" placeholder for the test data)
for column in null_valued_columns:
    replaceNulls(test_df, column, "none")

test_df.head()

### 1.3 Useful Feature Extraction

This section removes the unwanted features from the data based on their correlation with the target feature

The Pearson coefficient is used to calculate the correlation between two "continuous" features. The only continuous features in the data are **PRODUCT_ID** and **PRODUCT_PRICE**.

In [None]:
# Pairwise correlation matrix of the two numeric columns (DataFrame.corr uses Pearson by default)
train_df[["PRODUCT_ID", "PRODUCT_PRICE"]].corr()

We can observe that the correlation of **PRODUCT_ID** with **PRODUCT_PRICE** is very low. Thus, this column has little influence on the target, so we can remove the feature.

In [None]:
# Drop the identifier column from the training data
dropCol(train_df, "PRODUCT_ID")
train_df.head()

In [None]:
# saved for documenting predicted product prices
# (the ids are kept before the column is dropped; they are written next to the predictions in the output CSVs)
test_product_id = test_df["PRODUCT_ID"]
dropCol(test_df, "PRODUCT_ID")

In [None]:
# Preview the test data after PRODUCT_ID has been removed
test_df.head()

### 1.4 Preprocess text features

**Text Features**: Features where the values are textual (typically not a single word). In our case `PRODUCT_NAME` and `PRODUCT_DESCRIPTION`. <br>
<br>
To apply any model, all the features must be in numerical form. We use NLP (Natural Language Processing) to convert these text features into numerical features. For better performance, we apply some light pre-processing to these text features.

In [None]:
# replaces the general english shortcuts with their full forms
def replace_shortcuts(sentence):
    """Expand common English contractions in `sentence`.

    Parameters:
        sentence: the text to expand.

    Returns:
        The text with contractions such as "won't", "don't", "I'm" and
        "they're" replaced by their spelled-out forms.
    """
    # Irregular negations must be handled before any generic rule, otherwise
    # "won't" would be turned into "won not". Matched case-insensitively so a
    # capitalised "Won't" is not split into "Wo" + " not" by the n't rule.
    sentence = re.sub(r"won\'t", "will not", sentence, flags=re.IGNORECASE)
    sentence = re.sub(r"can\'t", "can not", sentence, flags=re.IGNORECASE)

    # Regular negations ("don't" -> "do not"): drop the "n" together with "'t".
    sentence = re.sub(r"n\'t", " not", sentence)

    # Remaining suffix contractions; these patterns do not overlap, so their
    # relative order does not matter.
    suffix_rules = (
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
        (r"\'re", " are"),
        (r"\'ll", " will"),
    )
    for pattern, expansion in suffix_rules:
        sentence = re.sub(pattern, expansion, sentence)

    return sentence

# removes the punctuation marks
def remove_punctuation(sentence):
    """Replace every character of string.punctuation with a space, then trim both ends."""
    marks = string.punctuation
    # One translation table maps each punctuation character to a single space
    to_space = str.maketrans(marks, ' ' * len(marks))
    return sentence.translate(to_space).strip()

# remove (if any) emoji's (like :) )
def remove_emoji(sentence):
    """Delete every run of characters that falls inside the listed Unicode ranges.

    NOTE(review): the last range spans U+24C2..U+1F251, so it also matches many
    non-emoji characters (for example CJK ideographs at U+4E00) - confirm this
    is intended. ASCII emoticons such as ":)" are not matched by any range.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    matcher = re.compile("[" + code_point_ranges + "]+", flags=re.UNICODE)
    return matcher.sub(r'', sentence)

In [None]:
# pre processing Text features
def preprocess_text_feature(feature):
    """Clean every string of a text column and return the cleaned strings as a list.

    Parameters:
        feature: a pandas Series (anything exposing `.values`) of strings.

    Returns:
        A list, in the same order as `feature`, of lower-cased strings with
        contractions expanded, non-alphanumeric characters replaced by
        spaces and stop words removed.
    """
    processed_feature = []
    for sentence in feature.values:
        sent = replace_shortcuts(sentence)
        # Literal backslash escape sequences (backslash + r / " / n) found in the raw text
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\"', ' ')
        sent = sent.replace('\\n', ' ')
        # Collapse every run of non-alphanumeric characters into one space
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        sent = remove_emoji(sent)
        sent = remove_punctuation(sent)
        # Lower-case BEFORE filtering: the stop-word list (NLTK English) is
        # lower-case, so filtering first would let "The", "And", ... through.
        sent = sent.lower()
        sent = ' '.join(e for e in sent.split() if e not in STOPWORDS)
        processed_feature.append(sent.strip())
    return processed_feature

In [None]:
# Clean both free-text columns of the train and test data, overwriting the raw text
text_features = ["PRODUCT_NAME", "PRODUCT_DESCRIPTION"]
for feature in text_features:
    train_df[feature] = preprocess_text_feature(train_df[feature])
    test_df[feature] = preprocess_text_feature(test_df[feature])

### 1.5 Split for CROSS-VALIDATION

Cross-validation is a technique used to verify the goodness of a model. Here we hold out 10% of the training data as a validation set (`cv_df`); we can tune the hyperparameters and assess the efficiency of the model by testing on it.

In [None]:
# log1p(x) = log(x+1): useful while calculating RMSLE
# (an RMSE computed on log1p targets is the RMSLE of the raw prices)
y_tr = np.log1p(train_df["PRODUCT_PRICE"])
# Hold out 10% of the rows as a validation set; random_state fixes the shuffle
train_df, cv_df , y_train, y_cv = train_test_split(train_df, y_tr, test_size=0.1, random_state=42)
cv_df.head()

### 1.6 Categorical to Numerical

In this section, we convert all categorical features into numerical features <br>
For `CATEGORY_i`, `PRODUCT_BRAND`: `CountVectorizer()` - Numeric Encoder<br>
For `PRODUCT_NAME`, `PRODUCT_DESCRIPTION`: `TfidfVectorizer()` - Text Encoder

In [None]:
# One dict per split, filled below with feature name -> encoded sparse matrix
train_set = {}
cv_set = {}
test_set = {}

In [None]:
class TextEncoder:
    """Holds the train / CV / test frames and TF-IDF-encodes one text column at a time."""

    def __init__(self, train_df, cv_df, test_df):
        self.train_df, self.cv_df, self.test_df = train_df, cv_df, test_df

    def TextEncode(self, column):
        """Return `(train, cv, test)` TF-IDF matrices for `column`.

        A fresh vectorizer is fitted on the training frame only; the CV and
        test frames are transformed with that fitted vocabulary.
        """
        vectorizer = TfidfVectorizer()
        encoded_train = vectorizer.fit_transform(self.train_df[column])
        encoded_cv, encoded_test = (
            vectorizer.transform(frame[column])
            for frame in (self.cv_df, self.test_df)
        )
        return (encoded_train, encoded_cv, encoded_test)

# Text columns that are TF-IDF encoded
features_to_te = ["PRODUCT_NAME", "PRODUCT_DESCRIPTION"]
# Give the instance its own name: rebinding `TextEncoder` to an instance would
# hide the class, and re-running this cell would then fail because an instance
# is not callable.
text_encoder = TextEncoder(train_df, cv_df, test_df)
for feature in features_to_te:
    (train_set[feature], cv_set[feature], test_set[feature]) = text_encoder.TextEncode(feature)

In [None]:
class OneHotEncoder:
    """Holds the train / CV / test frames and count-encodes one categorical column at a time.

    The encoding is done with CountVectorizer, i.e. each value is tokenised
    and represented by its token counts.
    """

    def __init__(self, train_df, cv_df, test_df):
        self.train_df, self.cv_df, self.test_df = train_df, cv_df, test_df

    def OneHotEncode(self, column):
        """Return `(train, cv, test)` count matrices for `column`.

        A fresh vectorizer is fitted on the training frame only; the CV and
        test frames are transformed with that fitted vocabulary.
        """
        vectorizer = CountVectorizer()
        encoded_train = vectorizer.fit_transform(self.train_df[column])
        encoded_cv, encoded_test = (
            vectorizer.transform(frame[column])
            for frame in (self.cv_df, self.test_df)
        )
        return (encoded_train, encoded_cv, encoded_test)

# Categorical columns that are count-encoded
features_to_ohe = ["CATEGORY_1", "CATEGORY_2", "CATEGORY_3", "CATEGORY_4", "CATEGORY_5", "PRODUCT_BRAND"]
# Give the instance its own name: rebinding `OneHotEncoder` to an instance would
# hide the class, and re-running this cell would then fail because an instance
# is not callable.
one_hot_encoder = OneHotEncoder(train_df, cv_df, test_df)
for feature in features_to_ohe:
    (train_set[feature], cv_set[feature], test_set[feature]) = one_hot_encoder.OneHotEncode(feature)

In [None]:
# Low-cardinality columns that are dummy (one-hot) encoded
other_features = ["PRODUCT_CONDITION", "SHIPPING_AVAILABILITY"]
for feature in other_features:
    # Fix the dummy columns from the training split. Encoding each split on its
    # own would give a split that lacks one level fewer (and shifted) columns,
    # so the matrices would no longer line up column-for-column.
    categories = sorted(train_df[feature].dropna().unique())
    for split_df, split_set in ((train_df, train_set), (cv_df, cv_set), (test_df, test_set)):
        # Values not seen in training fall outside `categories` and therefore
        # encode as an all-zero row.
        fixed = pd.Series(pd.Categorical(split_df[feature], categories=categories))
        split_set[feature] = csr_matrix(pd.get_dummies(fixed).to_numpy(dtype="float32"))

Merging all the csr matrices into a single csr matrix

In [None]:
from scipy.sparse import hstack
def mergeAll(set):
    """Stack every matrix stored in the dict `set` side by side into one float32 CSR matrix.

    The blocks are taken in the dict's iteration order.
    """
    blocks = tuple(set.values())
    stacked = hstack(blocks)
    return stacked.tocsr().astype('float32')

In [None]:
# Build one float32 CSR design matrix per split from all encoded features
X_train = mergeAll(train_set)
X_cv = mergeAll(cv_set)
X_test = mergeAll(test_set)

### 1.7 Normalisation

In [None]:
# class Normalizer:
    
#     def __init__(self, feature):
#         self.feature = feature
#         self.min = feature.min()
#         self.max = feature.max()

#     def normalize(self):
#         normalized_feature = (self.feature - self.min)/(self.max - self.min)
#         return normalized_feature
    
#     def denormalize(self, normalized_feature):
#         denormalized_feature = (normalized_feature * (self.max - self.min)) + self.min
#         return denormalized_feature

In [None]:
# TrainPriceNormalizer = Normalizer(y_train)
# y_train = TrainPriceNormalizer.normalize()

# CVPriceNormalizer = Normalizer(y_cv)
# y_cv = CVPriceNormalizer.normalize()

In [None]:
# y_train = TrainPriceNormalizer.denormalize(y_train)

### 1.8 Outlier Detection and Removal

This section of preprocessing checks for outliers and removes them (if present) to avoid misleading the model.

In [None]:
# Box plot of PRODUCT_CONDITION, used as a visual check for outliers
sns.boxplot(x = train_df["PRODUCT_CONDITION"])
plt.show()

If we observe the above boxplot, there are no points outside the whiskers. Hence, there are no outlier points for the PRODUCT_CONDITION column. By checking for every feature in this way, we found no outliers. 

## 2. Regression Models

By now we have completely pre-processed the train, validation and test data. We will train different regression models on the train data and test them on the validation data to assess each model. Then we predict the prices of the test data for the final submission.

In [None]:
def convertToCSV(product_price, product_id, filename):
    """Write predictions to `<filename>.csv` with PRODUCT_ID and PRODUCT_PRICE columns.

    Parameters:
        product_price: iterable of predicted prices, one per product.
        product_id: iterable of product ids, in the same order as the prices.
        filename: output path without the ".csv" extension.

    Returns:
        None; the CSV file is written without an index column.
    """
    # Materialise both inputs as plain lists so that rows are paired strictly
    # by position; pandas would otherwise align Series inputs on their index
    # labels, which can introduce missing values and extra rows.
    ids = list(product_id)
    prices = list(product_price)

    data = {'PRODUCT_ID': ids,
        'PRODUCT_PRICE': prices}

    pd.DataFrame(data).to_csv(filename+".csv", index=False)

In [None]:
# Combined train + validation data (rows of X_train first, then X_cv);
# the targets are concatenated in the same order
X_train_cv = vstack((X_train, X_cv)).tocsr().astype('float32')
y_train_cv = list(y_train) + list(y_cv)

### 2.1 Linear Regression

First, we use the naive Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the combined train + validation data and time the fit
start = time.time()
LR_model = LinearRegression()
LR_model.fit(X_train_cv, y_train_cv)
end = time.time()

# Elapsed wall-clock time in seconds
duration = round(end-start, 2)
print("Training time: " + str(duration) + " secs")

In [None]:
# Test predictions are in log1p(price) space, the same space as the training target
predicted_test_y_LR = LR_model.predict(X_test)

### 2.2 Ridge Regression

In this section, we apply Ridge Regression model on our data. Initially, we tune the hyperparam alpha for a best model using cross-validation. The method of error-check used is RMSLE

In [None]:
# hold-out validation for best alpha

alpha = [1.7, 2.1, 2.3, 3]
cv_rmsle_array = []
for i in alpha:
    model = Ridge(solver="sag", random_state=42, alpha=i)
    # Fit on the training split only. X_train_cv contains the validation rows,
    # so fitting on it and then scoring on X_cv would leak the validation data
    # into the model and make the comparison between alphas meaningless.
    model.fit(X_train, y_train)
    y_pred_cv = model.predict(X_cv)
    # RMSE on log1p targets, i.e. RMSLE on the raw prices
    cv_rmsle_array.append(sqrt(mse(y_cv, y_pred_cv)))

# best_alpha is an INDEX into `alpha`, not the alpha value itself
best_alpha = np.argmin(cv_rmsle_array)
print("Best value for Alpha = " + str(alpha[best_alpha]))

In [None]:
from sklearn.linear_model import Ridge

# Refit Ridge with the selected alpha (best_alpha indexes into `alpha`) on the
# combined train + validation data and time the fit
start = time.time()
RR_model = Ridge(solver = "sag", random_state =42, alpha = alpha[best_alpha])
RR_model.fit(X_train_cv, y_train_cv)
end = time.time()

# Elapsed wall-clock time in seconds
duration = round(end-start, 2)
print("Training time: " + str(duration) + " secs")

In [None]:
# Predictions of the final Ridge model on every split (all in log1p(price) space)
y_train_cv_pred_RR = RR_model.predict(X_train_cv)
y_train_pred_RR = RR_model.predict(X_train)
y_cv_pred_RR = RR_model.predict(X_cv)
y_test_pred_RR = RR_model.predict(X_test)

# RMSE in log1p space. NOTE: RR_model was fitted on X_train_cv, which contains
# X_cv, so the "CV" figure below is not a held-out estimate.
print("Error in Train+CV: " + str(sqrt(mse(y_train_cv, y_train_cv_pred_RR))))
print("Error in Train: " + str(sqrt(mse(y_train, y_train_pred_RR))))
print("Error in CV: " + str(sqrt(mse(y_cv, y_cv_pred_RR))))

In [None]:
# expm1 undoes the log1p applied to the target; writes output_RR.csv
convertToCSV(np.expm1(y_test_pred_RR), test_product_id, "output_RR")

### 2.3 XGBoost

In [None]:
# Fit an XGBoost regressor (n_estimators=100, eta=0.8) on the combined
# train + validation data and time the fit
start = time.time()
XGBR_model = xgb.XGBRegressor(n_estimators=100, eta=0.8)
XGBR_model.fit(X_train_cv,y_train_cv)
end = time.time()

# Elapsed wall-clock time in minutes
duration = round((end-start)/60.0, 2)
print("Training time: " + str(duration) + "mins")

In [None]:
# Predictions of the XGBoost model on every split (all in log1p(price) space)
y_train_cv_XGB_pred = XGBR_model.predict(X_train_cv)
y_train_XGB_pred = XGBR_model.predict(X_train)
y_cv_XGB_pred = XGBR_model.predict(X_cv)
y_test_XGB_pred = XGBR_model.predict(X_test)

# RMSE in log1p space. NOTE: the model was fitted on X_train_cv, which contains
# X_cv, so the "CV" figure below is not a held-out estimate.
print("TrainCV: " + str(sqrt(mse(y_train_cv, y_train_cv_XGB_pred))))
print("Train: " + str(sqrt(mse(y_train, y_train_XGB_pred))))
print("CV: " + str(sqrt(mse(y_cv, y_cv_XGB_pred))))
# expm1 undoes the log1p applied to the target, giving prices
y_test_XGB_final = np.expm1(y_test_XGB_pred)

In [None]:
# Keep the ids as a plain list for the export cells that follow
test_product_id = list(test_product_id)
# Write output_XGB.csv through the shared helper, like every other model,
# instead of rebuilding the DataFrame by hand
convertToCSV(y_test_XGB_final, test_product_id, "output_XGB")

### 2.4 Extracting k-important features
The upcoming models cannot handle the full feature set, so we extract the most important features.

In [None]:
# Features kept out of the selection step; every other encoded feature is stacked per split
numerical_keys = ["PRODUCT_CONDITION", "SHIPPING_AVAILABILITY"]
non_numeric_train, non_numeric_cv, non_numeric_test = (
    hstack(tuple(block for name, block in split.items() if name not in numerical_keys))
    for split in (train_set, cv_set, test_set)
)

In [None]:
# Keep the max_features columns that score highest under the univariate f_regression test
max_features = 10000
feature_selector = SelectKBest(f_regression, k=max_features)

# The selector is fitted on the training split only; the same column selection
# is then applied to the CV and test matrices
non_numeric_train_features = feature_selector.fit_transform(non_numeric_train, y_train)
non_numeric_cv_features    = feature_selector.transform(non_numeric_cv)
non_numeric_test_features  = feature_selector.transform(non_numeric_test)

In [None]:
# Re-attach the two dummy-encoded features to the selected columns. These reduced
# matrices REPLACE the full X_train / X_cv / X_test / X_train_cv built earlier.
X_train = hstack((non_numeric_train_features, train_set["PRODUCT_CONDITION"], train_set["SHIPPING_AVAILABILITY"]))
X_cv    = hstack((non_numeric_cv_features, cv_set["PRODUCT_CONDITION"], cv_set["SHIPPING_AVAILABILITY"]))
X_test  = hstack((non_numeric_test_features, test_set["PRODUCT_CONDITION"], test_set["SHIPPING_AVAILABILITY"]))
X_train_cv = vstack((X_train, X_cv))
X_train.shape

### 2.5 SVM

In [None]:

# Fit an SVR (C=3) on the training split only; max_iter=200 limits the number
# of solver iterations
start = time.time()
SVR_model = SVR(C=3, max_iter=200)
SVR_model.fit(X_train, y_train)
end = time.time()

# Elapsed wall-clock time in minutes
duration = round((end-start)/60.0, 2)
print("Training time: " + str(duration) + "mins")

In [None]:
# Predictions of the SVR on every split (all in log1p(price) space)
y_train_pred_SVR = SVR_model.predict(X_train)
y_cv_pred_SVR = SVR_model.predict(X_cv)
y_test_pred_SVR = SVR_model.predict(X_test)

# RMSE in log1p space; the SVR was fitted on X_train only, so X_cv is held out here
print("Error in Train: " + str(sqrt(mse(y_train, y_train_pred_SVR))))
print("Error in CV: " + str(sqrt(mse(y_cv, y_cv_pred_SVR))))

In [None]:
# Number of target values in the combined train + validation set
len(y_train_cv)

### 2.5 LightGBM

In [None]:
import lightgbm as LGBM

# LightGBM regressor with explicitly chosen boosting hyper-parameters
# NOTE(review): `silent` may not be accepted by newer LightGBM releases
# (verbosity is controlled through `verbose`) - confirm against the installed version
LGBR_model = LGBM.LGBMRegressor(learning_rate= 0.7, num_leaves = 30, n_estimators = 800,
                           min_child_samples = 20, subsample = 0.6, colsample_bytree = 0.6,  
                           n_jobs=-1 , silent = False)

# Fit on the combined train + validation data and time the fit
start = time.time()
LGBR_model.fit(X_train_cv, y_train_cv)
end = time.time()

# Elapsed wall-clock time in minutes
duration = round((end-start)/60.0, 2)
print("Training time: " + str(duration) + "mins")

In [None]:
# Predictions of the LightGBM model on every split (all in log1p(price) space)
y_train_cv_pred_LGBR = LGBR_model.predict(X_train_cv)
y_train_pred_LGBR = LGBR_model.predict(X_train)
y_cv_pred_LGBR = LGBR_model.predict(X_cv)
y_test_pred_LGBR = LGBR_model.predict(X_test)

# RMSE in log1p space. NOTE: the model was fitted on X_train_cv, which contains
# X_cv, so the "CV" figure below is not a held-out estimate.
print("Error in Train-CV: " + str(sqrt(mse(y_train_cv, y_train_cv_pred_LGBR))))
print("Error in Train: " + str(sqrt(mse(y_train, y_train_pred_LGBR))))
print("Error in CV: " + str(sqrt(mse(y_cv, y_cv_pred_LGBR))))

In [None]:
# expm1 undoes the log1p applied to the target; writes output_LGBR.csv
convertToCSV(np.expm1(y_test_pred_LGBR), test_product_id, "output_LGBR")

### 2.6 MLP

In [None]:
from sklearn.neural_network import MLPRegressor

# Fit a multi-layer perceptron regressor on the combined train + validation
# data and time the fit
start = time.time()
# max_iter must be a plain integer literal (a trailing letter makes the cell unparseable)
MLPR_model = MLPRegressor(random_state=1, max_iter=500).fit(X_train_cv, y_train_cv)
end = time.time()

# Elapsed wall-clock time in minutes
duration = round((end-start)/60.0, 2)
print("Training time: " + str(duration) + "mins")

In [None]:
# Predictions of the MLP on every split (all in log1p(price) space)
y_train_cv_pred_MLPR = MLPR_model.predict(X_train_cv)
y_train_pred_MLPR = MLPR_model.predict(X_train)
y_cv_pred_MLPR    = MLPR_model.predict(X_cv)
y_test_pred_MLPR  = MLPR_model.predict(X_test)

# RMSE in log1p space. NOTE: the model was fitted on X_train_cv, which contains
# X_cv, so the "CV" figure below is not a held-out estimate.
print("Train-CV Error: " + str(sqrt(mse(y_train_cv, y_train_cv_pred_MLPR))))
print("Train Error: " + str(sqrt(mse(y_train, y_train_pred_MLPR))))
print("CV: " + str(sqrt(mse(y_cv, y_cv_pred_MLPR))))

In [None]:
# expm1 undoes the log1p applied to the target; writes output_MLP.csv
convertToCSV(np.expm1(y_test_pred_MLPR), test_product_id, "output_MLP")

### 2.8 Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# Fit Lasso (alpha = 1) on the combined train + validation data and time the fit.
# The fitted model is not evaluated or used for predictions in the cells shown here.
start = time.time()
LaR_model = Lasso(alpha = 1)
LaR_model.fit(X_train_cv, y_train_cv)
end = time.time()

# Elapsed wall-clock time in minutes
duration = round((end-start)/60.0, 2)
print("Training time: " + str(duration) + "mins")

After comparing the various models, given the available GPU resources, we observed the best accuracy (F1-score) of **0.7** with the LightGBM model.

*TEAM: <br>
N V Sai Likhith - IMT2020118 <br>
T Akhil - IMT2020124*