# Libraries

In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# For text processing
from sklearn.feature_extraction.text import TfidfVectorizer

# For model building and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Machine Learning models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# For handling sparse matrices
from scipy.sparse import hstack, csr_matrix

# For scaling numeric features
from sklearn.preprocessing import StandardScaler

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')


# Loading the Files

Download the csv files into the `data/` directory.

In [9]:
# Load the CSV files into DataFrames
train_df_full = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')
print("Full train data shape:", train_df_full.shape)
print("Test data shape:", test_df.shape)

# Sampling 50% of the Training Data
train_df = train_df_full.sample(frac=0.5, random_state=42).reset_index(drop=True)
print("Sampled train data shape:", train_df.shape)

# Display the first few rows of the sampled training data
print("First few rows of sampled training data:")
print(train_df.head())

# Check for missing values in the sampled training data
print("Missing values in sampled training data:")
print(train_df.isnull().sum())


Full train data shape: (1697533, 9)
Test data shape: (212192, 2)
Sampled train data shape: (848766, 9)
First few rows of sampled training data:
        Id   ProductId          UserId  HelpfulnessNumerator  \
0   565351  B000053V8F  A3GAGA790C3F90                     0   
1   450021  6305126062  A2U9M4SE42KFK8                     0   
2  1124790  B00105308I  A1AISPOIIHTHXX                     0   
3  1521546  B006WQUL64  A2435XASRUN5O0                     1   
4   749431  B00008YGRS  A3I7UACA67A3NP                     0   

   HelpfulnessDenominator        Time  \
0                       1  1370304000   
1                       0  1379894400   
2                       0  1206403200   
3                       1  1374364800   
4                       1  1389139200   

                                             Summary  \
0              Technically Competent, but Lacks Soul   
1                              Redford story telling   
2       4.5; Laughing at the outrageous and shocking   


# Adding Features 

In [10]:
def add_features_to(df):
    """
    Adds new features to the DataFrame and handles missing values.
    """
    # Handle missing values in 'HelpfulnessDenominator' to avoid division by zero
    df['HelpfulnessDenominator'] = df['HelpfulnessDenominator'].replace(0, np.nan)
    df['Helpfulness'] = df['HelpfulnessNumerator'] / df['HelpfulnessDenominator']
    df['Helpfulness'] = df['Helpfulness'].fillna(0)

    # Convert 'Time' to datetime and extract date components
    df['Time'] = pd.to_datetime(df['Time'], unit='s')
    df['Review_Year'] = df['Time'].dt.year
    df['Review_Month'] = df['Time'].dt.month
    df['Review_Day'] = df['Time'].dt.day

    # Fill NaN values in 'Summary' and 'Text'
    df['Summary'] = df['Summary'].fillna('')
    df['Text'] = df['Text'].fillna('')

    # Calculate the length of 'Summary' and 'Text'
    df['Summary_length'] = df['Summary'].apply(len)
    df['Text_length'] = df['Text'].apply(len)

    # Calculate word counts in 'Summary' and 'Text'
    df['Summary_word_count'] = df['Summary'].apply(lambda x: len(x.split()))
    df['Text_word_count'] = df['Text'].apply(lambda x: len(x.split()))
    
    return df

# Apply feature engineering to the training data
train_df = add_features_to(train_df)


# Preparing Training and Test Data Model Building and Evaluation

In [11]:
# Training data: Rows where 'Score' is not null
train_data = train_df[train_df['Score'].notnull()].copy()

# Test data: Merge test_df with train_df_full to get all necessary data
test_data = pd.merge(test_df[['Id']], train_df_full.drop(columns=['Score']), on='Id', how='left')

# Combine 'Summary' and 'Text' into 'Combined_Text' for both train and test data
train_data['Combined_Text'] = train_data['Summary'] + ' ' + train_data['Text']
test_data['Combined_Text'] = test_data['Summary'] + ' ' + test_data['Text']


# Text Vectorization

In [12]:
# Ensure 'Summary' and 'Text' columns have no NaN values
train_data['Summary'] = train_data['Summary'].fillna('').astype(str)
train_data['Text'] = train_data['Text'].fillna('').astype(str)
test_data['Summary'] = test_data['Summary'].fillna('').astype(str)
test_data['Text'] = test_data['Text'].fillna('').astype(str)

# Create 'Combined_Text' column by concatenating 'Summary' and 'Text'
train_data['Combined_Text'] = (train_data['Summary'] + ' ' + train_data['Text']).fillna('').astype(str)
test_data['Combined_Text'] = (test_data['Summary'] + ' ' + test_data['Text']).fillna('').astype(str)

# Check if any NaN values are present in the 'Combined_Text' column
print("Number of NaN values in train_data['Combined_Text']:", train_data['Combined_Text'].isna().sum())
print("Number of NaN values in test_data['Combined_Text']:", test_data['Combined_Text'].isna().sum())

# Combine the text data for vectorization
combined_text = pd.concat([train_data['Combined_Text'], test_data['Combined_Text']], axis=0)

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=8000, ngram_range=(1, 2), stop_words='english')

# Fit and transform the combined text data
tfidf_combined_text = tfidf_vectorizer.fit_transform(combined_text)

# Split the vectorized data back into training and test sets
tfidf_train = tfidf_combined_text[:len(train_data)]
tfidf_test = tfidf_combined_text[len(train_data):]


Number of NaN values in train_data['Combined_Text']: 0
Number of NaN values in test_data['Combined_Text']: 0


#  Preparing Numerical Features

In [13]:
# Prepare Numerical Features
# ----------------------------
# List of numerical features to include
numeric_features = [
    'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Helpfulness',
    'Summary_length', 'Text_length', 'Summary_word_count', 'Text_word_count',
    'Review_Year', 'Review_Month', 'Review_Day'
]

# Ensure that all the numeric features are present in both train_data and test_data
# Filter only the features that exist in both datasets
numeric_features = [
    feature for feature in numeric_features
    if feature in train_data.columns and feature in test_data.columns
]

# Check which features are missing from the train and test data
missing_train_features = set(numeric_features) - set(train_data.columns)
missing_test_features = set(numeric_features) - set(test_data.columns)

print(f"Missing features in train_data: {missing_train_features}")
print(f"Missing features in test_data: {missing_test_features}")

# Prepare numeric features for training and test data
X_train_numeric = train_data[numeric_features].fillna(0)
X_test_numeric = test_data[numeric_features].fillna(0)

# Verify that X_train_numeric and X_test_numeric are defined and have the expected shape
print("X_train_numeric shape:", X_train_numeric.shape)
print("X_test_numeric shape:", X_test_numeric.shape)


Missing features in train_data: set()
Missing features in test_data: set()
X_train_numeric shape: (742644, 2)
X_test_numeric shape: (212192, 2)


# Combining Features


In [14]:
# Combining Features
# ------------------
from scipy.sparse import hstack, csr_matrix

# Convert numeric features to sparse matrices
# Ensure that X_train_numeric and X_test_numeric are correctly prepared
X_train_num_sparse = csr_matrix(X_train_numeric.values)
X_test_num_sparse = csr_matrix(X_test_numeric.values)

# Combine TF-IDF text features with numeric features
# Ensure that `tfidf_train` and `tfidf_test` were prepared correctly during text vectorization
print(f"Shape of tfidf_train: {tfidf_train.shape}")
print(f"Shape of X_train_num_sparse: {X_train_num_sparse.shape}")
print(f"Shape of tfidf_test: {tfidf_test.shape}")
print(f"Shape of X_test_num_sparse: {X_test_num_sparse.shape}")

# Ensure the shapes of the TF-IDF and numeric matrices are compatible before combining
X_train_combined = hstack([tfidf_train, X_train_num_sparse])
X_test_combined = hstack([tfidf_test, X_test_num_sparse])

# Check the final shapes after combining
print(f"X_train_combined shape: {X_train_combined.shape}")
print(f"X_test_combined shape: {X_test_combined.shape}")

# Define the target variable (make sure it exists in train_data)
y_train = train_data['Score']

# Verify that y_train has the correct shape
print(f"y_train shape: {y_train.shape}")


Shape of tfidf_train: (742644, 8000)
Shape of X_train_num_sparse: (742644, 2)
Shape of tfidf_test: (212192, 8000)
Shape of X_test_num_sparse: (212192, 2)
X_train_combined shape: (742644, 8002)
X_test_combined shape: (212192, 8002)
y_train shape: (742644,)


# Model Building and Evaluation

In [15]:
# Split the training data into training and validation sets
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train_combined, y_train, test_size=0.25, random_state=42)

# Logistic Regression Model
print("Training Logistic Regression model...")
logreg = LogisticRegression(max_iter=5000, n_jobs=-1)
logreg.fit(X_train_part, y_train_part)

# Predict and evaluate Logistic Regression
y_pred_logreg = logreg.predict(X_valid)
accuracy_logreg = accuracy_score(y_valid, y_pred_logreg)
print("Logistic Regression Accuracy:", accuracy_logreg)

# Multinomial Naive Bayes Model
print("Training Multinomial Naive Bayes model...")
mnb = MultinomialNB()
mnb.fit(X_train_part, y_train_part)

# Predict and evaluate Multinomial Naive Bayes
y_pred_mnb = mnb.predict(X_valid)
accuracy_mnb = accuracy_score(y_valid, y_pred_mnb)
print("Multinomial Naive Bayes Accuracy:", accuracy_mnb)

# Random Forest Classifier Model
print("Training Random Forest Classifier model...")
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf_classifier.fit(X_train_part, y_train_part)

# Predict and evaluate Random Forest Classifier
y_pred_rf = rf_classifier.predict(X_valid)
accuracy_rf = accuracy_score(y_valid, y_pred_rf)
print("Random Forest Classifier Accuracy:", accuracy_rf)


Training Logistic Regression model...
Logistic Regression Accuracy: 0.6562983071296611
Training Multinomial Naive Bayes model...
Multinomial Naive Bayes Accuracy: 0.5817430693575926
Training Random Forest Classifier model...
Random Forest Classifier Accuracy: 0.5968781812012216


# Model Selection

In [16]:
# Compare model accuracies
print("\nModel Accuracies:")
print(f"Logistic Regression Accuracy: {accuracy_logreg}")
print(f"Multinomial Naive Bayes Accuracy: {accuracy_mnb}")
print(f"Random Forest Classifier Accuracy: {accuracy_rf}")

# Select the best model based on validation accuracy
accuracies = {
    'Logistic Regression': accuracy_logreg,
    'Multinomial Naive Bayes': accuracy_mnb,
    'Random Forest Classifier': accuracy_rf
}

best_model_name = max(accuracies, key=accuracies.get)
print(f"\nBest model based on validation accuracy: {best_model_name}")



Model Accuracies:
Logistic Regression Accuracy: 0.6562983071296611
Multinomial Naive Bayes Accuracy: 0.5817430693575926
Random Forest Classifier Accuracy: 0.5968781812012216

Best model based on validation accuracy: Logistic Regression


# Final Model Training and Submission

In [17]:
# Retrain the best model on the entire training data
print(f"\nRetraining the best model ({best_model_name}) on the entire training data...")

if best_model_name == 'Logistic Regression':
    best_model = LogisticRegression(max_iter=5000, n_jobs=-1)
    best_model.fit(X_train_combined, y_train)
    y_test_pred = best_model.predict(X_test_combined)

elif best_model_name == 'Multinomial Naive Bayes':
    best_model = MultinomialNB()
    best_model.fit(X_train_combined, y_train)
    y_test_pred = best_model.predict(X_test_combined)

elif best_model_name == 'Random Forest Classifier':
    best_model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    best_model.fit(X_train_combined, y_train)
    y_test_pred = best_model.predict(X_test_combined)
else:
    print("No valid model selected.")
    y_test_pred = None

# Prepare the submission file
submission = test_df[['Id']].copy()
submission['Score'] = y_test_pred

# Save the submission file to CSV
submission.to_csv('submission.csv', index=False)

print("\nSubmission file created successfully!")



Retraining the best model (Logistic Regression) on the entire training data...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Submission file created successfully!
