# COSC 74 Final Project - Amazon Review Multiclass Classification
## Nikhil Pande and Colton Sankey

In [617]:
!pip install vaderSentiment
!pip install nltk



In [651]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, LabelEncoder, StandardScaler
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer

import re # for regex

# NLTK packages
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer


In [652]:
# Load the dataset
train_data = pd.DataFrame(pd.read_csv('amazon_train.csv'))
test_data = pd.DataFrame(pd.read_csv('amazon_test.csv'))

y_train = train_data['overall']
train_data = train_data[['reviewText', 'summary', 'verified', 'vote', 'image', 'unixReviewTime']]
test_id = test_data['id'] # for the final submission
test_data = test_data[['reviewText', 'summary', 'verified', 'vote', 'image', 'unixReviewTime']]


### Preprocess and Combine Text

In [653]:
# Data Preprocessing
# Handle the missing values in the summary and vote columns
train_data['summary'] = train_data['summary'].fillna("") # a few na summary rows were messing with the combined text column
test_data['summary'] = test_data['summary'].fillna("")
train_data['vote'] = train_data['vote'].fillna(0.0)
test_data['vote'] = test_data['vote'].fillna(0.0)

# put 0 (no) or 1 (yes) for whether or not there is an image
train_data['image'] = train_data['image'].notna().astype(int)
test_data['image'] = test_data['image'].notna().astype(int)


# Create 'combined_text'
train_data['combined_text'] = train_data['reviewText'] + ' ' + train_data['summary']
test_data['combined_text'] = test_data['reviewText'] + ' ' + test_data['summary']


# Function to clean text
def clean_text(text):
    # Replace whitespace with a single space
    cleaned_text = re.sub(r'\s+', ' ', text)
    # Replace ":)" with "happy" in 'reviewText' and 'summary' columns
    text.replace(":)", "happy")
    return cleaned_text

train_data['reviewText'] = train_data['reviewText'].apply(clean_text)
test_data['reviewText'] = test_data['reviewText'].apply(clean_text)

train_data['summary'] = train_data['summary'].apply(clean_text)
test_data['summary'] = test_data['summary'].apply(clean_text)


train_data.head(5)

Unnamed: 0,reviewText,summary,verified,vote,image,unixReviewTime,combined_text
0,all of the reviews for this product are fake.,"All fake reviews, beware.",False,2.0,0,1478908800,all of the reviews for this product are fake. ...
1,wrong part. our fault.,One Star,True,0.0,0,1480982400,wrong part. our fault. One Star
2,this wire set it really sucks!!!,One Star,True,0.0,0,1410912000,this wire set it really sucks!!! One Star
3,"first use, it leaked instantly. even at 5 buck...",One Star,True,0.0,0,1465603200,"first use, it leaked instantly. even at 5 buck..."
4,didn't fit,One Star,True,0.0,0,1513987200,didn't fit One Star


In [654]:
# Mapping between written-out numbers and numeric values
written_numbers_mapping = {
    'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10
}

# Function to convert written-out numbers to numeric
def convert_written_number(word):
    if pd.isna(word):
        return word
    return written_numbers_mapping.get(word.lower(), word)

train_data['explicit_star_count'] = train_data['summary'].str.extract(r'(\w+)\s*star[s]*', flags=re.IGNORECASE)
test_data['explicit_star_count'] = test_data['summary'].str.extract(r'(\w+)\s*star[s]*', flags=re.IGNORECASE)

train_data['explicit_star_count'] = train_data['explicit_star_count'].apply(convert_written_number)
test_data['explicit_star_count'] = test_data['explicit_star_count'].apply(convert_written_number)




In [655]:
# Check if 'ok' or 'okay' is present in the 'summary' column
train_data['contains_okay'] = train_data['summary'].str.contains(r'\bok(?:ay)?\b', case=False)
test_data['contains_okay'] = test_data['summary'].str.contains(r'\bok(?:ay)?\b', case=False)

# Set 'expected_star_count' to 3 if 'contains_okay' is True, otherwise keep the existing value
train_data['explicit_star_count'] = np.where(train_data['contains_okay'], 3, train_data['explicit_star_count'])
test_data['explicit_star_count'] = np.where(test_data['contains_okay'], 3, test_data['explicit_star_count'])

# Drop the temporary column 'contains_okay' if you don't need it
train_data = train_data.drop('contains_okay', axis=1)
test_data = test_data.drop('contains_okay', axis=1)

In [656]:
# NLTK-based feature engineering
ps = PorterStemmer()

# Count of tokens
# Tokenize, remove stopwords, and perform stemming
train_data['tokens'] = train_data['combined_text'].apply(lambda x: [ps.stem(word) for word in word_tokenize(x)])
test_data['tokens'] = test_data['combined_text'].apply(lambda x: [ps.stem(word) for word in word_tokenize(x)])
# token_count is the length of the review
train_data['token_count'] = train_data['tokens'].apply(len)
test_data['token_count'] = test_data['tokens'].apply(len)

# drop tokens so only word count remains
train_data = train_data.drop('tokens', axis = 1)
test_data = test_data.drop('tokens', axis = 1)

### Sentiment Analysis

In [658]:
# Function to get sentiment analysis
def sentiment_analysis(text):
    analyzer = SentimentIntensityAnalyzer()
    sentiment_score = analyzer.polarity_scores(text)
    return sentiment_score['compound'], sentiment_score['neg'], sentiment_score['pos']
#     return sentiment_score


# 2
# Temporary DataFrames for 'reviewText'
sentiment_features_train_r = train_data['combined_text'].apply(sentiment_analysis).apply(lambda x: pd.Series(x, index=['compound_r', 'neg_r', 'pos_r']))
sentiment_features_test_r = test_data['combined_text'].apply(sentiment_analysis).apply(lambda x: pd.Series(x, index=['compound_r', 'neg_r', 'pos_r']))

# Concatenate the new DataFrames with the original DataFrames
train_data = pd.concat([train_data, sentiment_features_train_r], axis=1)
test_data = pd.concat([test_data, sentiment_features_test_r], axis=1)


In [106]:
train_data[train_data['reviewText'].str.contains(r'\bstar[s]?\b', case=False)].to_csv("stars.csv", index=False)

### NLTK: Count tokens and exclamation points (optional)

In [659]:
# Count exclamation marks!

train_data['num_exclamation_marks'] = train_data['combined_text'].str.count('!')
test_data['num_exclamation_marks'] = test_data['combined_text'].str.count('!')

train_data['trust'] = train_data['compound_r'] * (train_data['vote'] ** 2)
test_data['trust'] = test_data['compound_r'] * (test_data['vote'] ** 2)

train_data['emphasis'] = train_data['num_exclamation_marks'] * train_data['compound_r']
test_data['emphasis'] = test_data['num_exclamation_marks'] * test_data['compound_r']

In [660]:
# Label encoding for 'verified' column

#LABEL ENCODER
label_encoder = LabelEncoder()
train_data['verified'] = label_encoder.fit_transform(train_data['verified'])
test_data['verified'] = label_encoder.transform(test_data['verified'])



In [661]:

train_data = train_data.drop('combined_text', axis = 1)
test_data = test_data.drop('combined_text', axis = 1)

### Split

In [662]:
# Split into training and validation
X_train, X_valid, y_train, y_valid = train_test_split(train_data, y_train, test_size = 0.2, random_state = 0)
y_train_actual = pd.DataFrame(pd.read_csv("amazon_train.csv"))['overall']

In [663]:
# Convert the 'explicit_star_count' column to numeric
X_train['explicit_star_count'] = pd.to_numeric(X_train['explicit_star_count'], errors='coerce')
X_valid['explicit_star_count'] = pd.to_numeric(X_valid['explicit_star_count'], errors='coerce')
test_data['explicit_star_count'] = pd.to_numeric(test_data['explicit_star_count'], errors='coerce')


train_data['explicit_star_count'] = pd.to_numeric(train_data['explicit_star_count'], errors = 'coerce')

### Preprocess with TF-IDF Vectorizer

In [682]:
# Create a ColumnTransformer for preprocessing with tfidf
preprocessor = ColumnTransformer(
    transformers=[('text', TfidfVectorizer(max_features = 3000), 'reviewText'),
                 ('text2', TfidfVectorizer(max_features = 3000), 'summary')],
    remainder='passthrough'
)

# Apply the preprocessing to the training and test sets
X_train_pre = preprocessor.fit_transform(X_train)
X_valid_pre = preprocessor.transform(X_valid)
X_test_pre = preprocessor.transform(test_data)

In [683]:
# Missing values in explicit_star_count that need to be filled
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='constant', fill_value = -1, missing_values=np.nan)
X_train_pre_imputed = imputer.fit_transform(X_train_pre)
X_valid_pre_imputed = imputer.transform(X_valid_pre)
X_test_pre_imputed = imputer.transform(X_test_pre)

### Reporting Function

In [672]:
# Function to report the results of the predictions for future use
def report(y_actual, y_pred):
    conf_matrix = confusion_matrix(y_actual, y_pred)
    accuracy = accuracy_score(y_actual, y_pred)
    macro_f1 = f1_score(y_actual, y_pred, average='macro')
    
    # Getting the roc auc score
    lb = LabelBinarizer()
    y_valid_bin = lb.fit_transform(y_actual)

    # Binarize the predicted labels
    y_pred_validlr_bin = lb.transform(y_pred)

    # Calculate ROC AUC score
    roc_auc = roc_auc_score(y_valid_bin, y_pred_validlr_bin, multi_class='ovr')
    
    # print metrics
    print("Confusion Matrix:\n", conf_matrix)
    print("Accuracy:", accuracy)
    print("Macro F1 Score:", macro_f1)
    print(f'ROC AUC Score: {roc_auc}')

## Random Forest

In [688]:
# Define hyperparameter grid for GridSearchCV
param_gridrf = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    # Add other hyperparameters to tune
}

# Use GridSearchCV for hyperparameter tuning
gridrf = GridSearchCV(RandomForestClassifier(), param_grid=param_gridrf, cv=3, scoring='f1_macro')
gridrf.fit(X_train_pre_imputed, y_train)

In [689]:
print(gridrf.best_params_)

{'max_depth': 30, 'n_estimators': 100}


In [690]:
rf = RandomForestClassifier(n_estimators = 100, max_depth = 30)
rf.fit(X_train_pre_imputed, y_train)

# Predict and find results
y_pred_validrf = rf.predict(X_valid_pre_imputed)

report(y_valid, y_pred_validrf)

Confusion Matrix:
 [[826 176  62  52  32]
 [430 489 151 111  32]
 [147 212 556 216  47]
 [ 52  63 161 661 234]
 [ 41  41  52 203 791]]
Accuracy: 0.5692017814319973
Macro F1 Score: 0.5663577797846359
ROC AUC Score: 0.7321741668079962


## Logistic Regression

In [434]:
param_gridlr = {
    'C': [1, 2, 3, 5, 10],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'solver': ['liblinear', 'lbfgs', 'newton-cg']
}

gridlr = GridSearchCV(LogisticRegression(), param_grid=param_gridlr, cv = 3, scoring = 'f1_macro', verbose = 3)
gridlr.fit(X_train_pre_imputed, pd.DataFrame(pd.read_csv("amazon_train.csv"))['overall'])

Fitting 3 folds for each of 45 candidates, totalling 135 fits
[CV 1/3] END .C=1, penalty=l1, solver=liblinear;, score=0.549 total time=   1.9s
[CV 2/3] END .C=1, penalty=l1, solver=liblinear;, score=0.599 total time=   1.6s
[CV 3/3] END .C=1, penalty=l1, solver=liblinear;, score=0.601 total time=   0.8s
[CV 1/3] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3] END ...C=1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/3] END ...C=1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 3/3] END ...C=1, penalty=l1, solver=newton-cg;, score=nan total time=   0.0s
[CV 1/3] END .C=1, penalty=l2, solver=liblinear;, score=0.544 total time=   1.1s
[CV 2/3] END .C=1, penalty=l2, solver=liblinear;, score=0.598 total time=   1.5s
[CV 3/3] END .C=1, penalty=l2, solver=liblinear

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END .....C=1, penalty=l2, solver=lbfgs;, score=0.412 total time=   1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END .....C=1, penalty=l2, solver=lbfgs;, score=0.349 total time=   1.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END .....C=1, penalty=l2, solver=lbfgs;, score=0.396 total time=   1.1s
[CV 1/3] END .C=1, penalty=l2, solver=newton-cg;, score=0.546 total time=   9.0s
[CV 2/3] END .C=1, penalty=l2, solver=newton-cg;, score=0.599 total time=  11.4s
[CV 3/3] END .C=1, penalty=l2, solver=newton-cg;, score=0.599 total time=  11.6s
[CV 1/3] END C=1, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3] END C=1, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3] END C=1, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/3] END C=1, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3] END C=1, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3] END C=1, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3] END C=1, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/3] END C=1, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END .....C=2, penalty=l2, solver=lbfgs;, score=0.413 total time=   1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END .....C=2, penalty=l2, solver=lbfgs;, score=0.393 total time=   1.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END .....C=2, penalty=l2, solver=lbfgs;, score=0.416 total time=   1.1s
[CV 1/3] END .C=2, penalty=l2, solver=newton-cg;, score=0.537 total time=  10.3s
[CV 2/3] END .C=2, penalty=l2, solver=newton-cg;, score=0.594 total time=  13.2s
[CV 3/3] END .C=2, penalty=l2, solver=newton-cg;, score=0.593 total time=  13.3s
[CV 1/3] END C=2, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3] END C=2, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3] END C=2, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/3] END C=2, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3] END C=2, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3] END C=2, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3] END C=2, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/3] END C=2, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END .....C=3, penalty=l2, solver=lbfgs;, score=0.409 total time=   1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END .....C=3, penalty=l2, solver=lbfgs;, score=0.385 total time=   1.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END .....C=3, penalty=l2, solver=lbfgs;, score=0.436 total time=   1.1s
[CV 1/3] END .C=3, penalty=l2, solver=newton-cg;, score=0.529 total time=  11.3s
[CV 2/3] END .C=3, penalty=l2, solver=newton-cg;, score=0.586 total time=  15.6s
[CV 3/3] END .C=3, penalty=l2, solver=newton-cg;, score=0.587 total time=  16.1s
[CV 1/3] END C=3, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3] END C=3, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3] END C=3, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/3] END C=3, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3] END C=3, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3] END C=3, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3] END C=3, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/3] END C=3, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END .....C=5, penalty=l2, solver=lbfgs;, score=0.405 total time=   1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END .....C=5, penalty=l2, solver=lbfgs;, score=0.368 total time=   3.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END .....C=5, penalty=l2, solver=lbfgs;, score=0.386 total time=   1.1s
[CV 1/3] END .C=5, penalty=l2, solver=newton-cg;, score=0.525 total time=  14.1s
[CV 2/3] END .C=5, penalty=l2, solver=newton-cg;, score=0.579 total time=  16.9s
[CV 3/3] END .C=5, penalty=l2, solver=newton-cg;, score=0.578 total time=  16.9s
[CV 1/3] END C=5, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3] END C=5, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3] END C=5, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/3] END C=5, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3] END C=5, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3] END C=5, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3] END C=5, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/3] END C=5, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/3] END ....C=10, penalty=l2, solver=lbfgs;, score=0.406 total time=   1.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/3] END ....C=10, penalty=l2, solver=lbfgs;, score=0.396 total time=   1.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/3] END ....C=10, penalty=l2, solver=lbfgs;, score=0.383 total time=   1.1s
[CV 1/3] END C=10, penalty=l2, solver=newton-cg;, score=0.514 total time=  14.7s
[CV 2/3] END C=10, penalty=l2, solver=newton-cg;, score=0.565 total time=  24.8s
[CV 3/3] END C=10, penalty=l2, solver=newton-cg;, score=0.567 total time=  23.3s
[CV 1/3] END C=10, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 2/3] END C=10, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 3/3] END C=10, penalty=elasticnet, solver=liblinear;, score=nan total time=   0.0s
[CV 1/3] END C=10, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/3] END C=10, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/3] END C=10, penalty=elasticnet, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/3] END C=10, penalty=elasticnet, solver=newton-cg;, score=nan total time=   0.0s
[CV 2/3] END C=10, penalty=elasticnet, solver=newton-cg;, score=nan total time=

75 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nikhilpande/miniconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nikhilpande/miniconda3/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/nikhilpande/miniconda3/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/nikhilpande/miniconda3/lib/python3.10/sit

In [616]:
print(gridlr.best_params_)

{'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}


In [685]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

lr = LogisticRegression(C=1, penalty = 'l1', solver = 'liblinear', random_state = 0)
ovo = OneVsOneClassifier(lr)
ovo.fit(X_train_pre_imputed, y_train)

y_pred_validlr = ovo.predict(X_valid_pre_imputed)

report(y_valid, y_pred_validlr)



Confusion Matrix:
 [[806 227  51  33  31]
 [289 586 235  71  32]
 [ 87 219 636 199  37]
 [ 30  60 169 699 213]
 [ 24  37  49 201 817]]
Accuracy: 0.6070572113737581
Macro F1 Score: 0.6075758357909166
ROC AUC Score: 0.7554613976591465




### Gradient Booster

In [856]:
#Gradient Boosting Classifier Grid Search
GBparam_grid = {
    'n_estimators': [30, 50],
    'max_depth': [1, 3],
    'learning_rate': [0.05, 0.1]
    # Add other hyperparameters to tune
}

# Use GridSearchCV for hyperparameter tuning
GBgrid = GridSearchCV(GradientBoostingClassifier(), param_grid=GBparam_grid, cv=3, scoring='f1_macro')
GBgrid.fit(X_train_pre, y_train)

In [857]:
GBgrid.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}

In [681]:
gb = GradientBoostingClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 50)
gb.fit(X_train_pre_imputed, y_train)

y_pred_validgb = gb.predict(X_valid_pre_imputed)

report(y_valid, y_pred_validgb)


Confusion Matrix:
 [[749 256  57  58  28]
 [306 565 209  94  39]
 [ 98 245 569 200  66]
 [ 48  89 165 619 250]
 [ 33  58  58 218 761]]
Accuracy: 0.5589242891401165
Macro F1 Score: 0.5596171553536291
ROC AUC Score: 0.7252596091543485


### Submit Data

In [687]:
# Submit

submit_pred = ovo.predict(X_test_pre_imputed)

submission_data = {'id':test_id, 'pred':submit_pred}
submission_df = pd.DataFrame(submission_data)
submission_df


Unnamed: 0,id,pred
0,a0,1
1,a1,1
2,a2,1
3,a3,1
4,a4,2
...,...,...
4495,a4495,5
4496,a4496,5
4497,a4497,5
4498,a4498,5


In [648]:
submission_df.to_csv('amazonmulti37.csv', index=False)