In [5]:
%matplotlib inline

import pickle
import pandas as pd
import seaborn as sns
from os.path import exists
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

from textblob import TextBlob
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
)
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, QuantileTransformer,
    PowerTransformer
)
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
trainingSet = pd.read_csv("./data/train.csv")
testingSet = pd.read_csv("./data/test.csv")

trainingSet['CombinedText'] = trainingSet['Summary'].fillna('') + " " + \
                              trainingSet['Text'].fillna('')

all_df = trainingSet.copy()
all_df.info();

## Separate Train/Test Sets
test_ids = testingSet['Id'].tolist()
test_ids_set = set(test_ids)
print(f'Num Test Samples: {len(test_ids)}')
print(f'Num Unique Test Samples; {len(test_ids_set)}')

train_ids = trainingSet['Id'].tolist()
train_ids_set = set(train_ids)
print(f'Num Train Samples: {len(train_ids)}')
print(f'Num Unique Train Samples: {len(train_ids_set)}')
print(f'Train/Test overlap: {len(train_ids_set.intersection(test_ids_set))}')

train_df = trainingSet[~trainingSet['Id'].isin(test_ids_set)]
test_df = trainingSet[trainingSet['Id'].isin(test_ids_set)]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1697533 entries, 0 to 1697532
Data columns (total 10 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Id                      int64  
 1   ProductId               object 
 2   UserId                  object 
 3   HelpfulnessNumerator    int64  
 4   HelpfulnessDenominator  int64  
 5   Time                    int64  
 6   Summary                 object 
 7   Text                    object 
 8   Score                   float64
 9   CombinedText            object 
dtypes: float64(1), int64(4), object(5)
memory usage: 129.5+ MB
Num Test Samples: 212192
Num Unique Test Samples; 212192
Num Train Samples: 1697533
Num Unique Train Samples: 1697533
Train/Test overlap: 212192


In [7]:
### Create Features

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

all_df = trainingSet.copy()

# Helpfulness
all_df['Helpfulness'] = all_df['HelpfulnessNumerator'] / (all_df['HelpfulnessDenominator'] + 1e-7)
all_df['DownVotes'] = all_df['HelpfulnessDenominator'] - all_df['HelpfulnessNumerator']


# ProductID Average Score & Variance
avg_score_per_product = train_df.groupby('ProductId')['Score'].mean().reset_index()
avg_score_per_product.columns = ['ProductId', 'AvgScore']
all_df = all_df.merge(avg_score_per_product, on='ProductId', how='left')

score_variance_per_product = train_df.groupby('ProductId')['Score'].var().reset_index()
score_variance_per_product.columns = ['ProductId', 'ScoreVariance']
all_df = all_df.merge(score_variance_per_product, on='ProductId', how='left')
    
# ProductID Popularity Score
product_review_count = train_df['ProductId'].value_counts().reset_index()
product_review_count.columns = ['ProductId', 'ReviewCount']
all_df = all_df.merge(product_review_count, on='ProductId', how='left')

# Time features
all_df['TimeYear'] = pd.to_datetime(all_df['Time'], unit='s').dt.year
all_df['TimeMonth'] = pd.to_datetime(all_df['Time'], unit='s').dt.month
all_df['TimeDays'] = (train_df['Time'].max() - all_df['Time']) / (60 * 60 * 24)

# Text Features
def get_sentiment(text, discrete=False):
    text = str(text)
    score = TextBlob(text).sentiment.polarity
    return score

def get_sentiment_discrete(sentiment):
    score = float(sentiment)
    if score > 0.05:
        return 1
    elif score < -0.05:
        return -1
    return 0

def get_subjectivity(text):
    text = str(text)
    return TextBlob(text).sentiment.subjectivity


print(f'Getting sentiments..')
all_df['SummarySentiment'] = all_df['Summary'].parallel_apply(get_sentiment)
all_df['TextSentiment'] = all_df['Text'].parallel_apply(get_sentiment)

print(f'Getting discrete sentiments..')
all_df['SummarySentimentDiscrete'] = all_df['SummarySentiment'].parallel_apply(get_sentiment_discrete)
all_df['TextSentimentDiscrete'] = all_df['TextSentiment'].parallel_apply(get_sentiment_discrete)

print(f'Getting subjectivities..')
all_df['SummarySubjectivity'] = all_df['Summary'].parallel_apply(get_subjectivity)
all_df['TextSubjectivity'] = all_df['Text'].parallel_apply(get_subjectivity)

print(f'Getting text info..')
all_df['TextLength'] = all_df['Text'].parallel_apply(lambda x: len(str(x)))
all_df['TextExclamations'] = all_df['Text'].parallel_apply(lambda x: str(x).count('!'))
all_df['TextQuestions'] = all_df['Text'].parallel_apply(lambda x: str(x).count('?'))
all_df['TextCapitalLetters'] = all_df['Text'].parallel_apply(lambda x: sum(1 for letter in str(x) if letter == letter.upper()))


# Create all features X
features = [
    'HelpfulnessNumerator', 'HelpfulnessDenominator',
    'Helpfulness', 'DownVotes', 
    'Time', 'TimeYear', 'TimeMonth', 'TimeDays',
    'AvgScore', 'ScoreVariance',
    'ReviewCount',
    'SummarySentiment', 'TextSentiment',
    'SummarySentimentDiscrete', 'TextSentimentDiscrete',
    'SummarySubjectivity', 'TextSubjectivity',
    'TextLength',
    'TextExclamations', 'TextQuestions', 'TextCapitalLetters'
]

# # TDIDF Features
# all_df['Combined_Text'] = all_df['Summary'].fillna('') + " " + all_df['Text'].fillna('')
# tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')  # Adjust as needed
# tfidf_matrix = tfidf.fit_transform(all_df['Combined_Text'])
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
# all_df = pd.concat([all_df.reset_index(drop=True), 
#                     tfidf_df.reset_index(drop=True)], axis=1)
# tfidf_features = tfidf.get_feature_names_out().tolist()
# features.extend(tfidf_features)

print(f'\n* Using a total of {len(features)} features.')
X = all_df[features]

INFO: Pandarallel will run on 12 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
Getting sentiments..


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=141462), Label(value='0 / 141462')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=141462), Label(value='0 / 141462')…

Getting discrete sentiments..


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=141462), Label(value='0 / 141462')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=141462), Label(value='0 / 141462')…

Getting subjectivities..


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=141462), Label(value='0 / 141462')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=141462), Label(value='0 / 141462')…

Getting text info..


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=141462), Label(value='0 / 141462')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=141462), Label(value='0 / 141462')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=141462), Label(value='0 / 141462')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=141462), Label(value='0 / 141462')…


* Using a total of 21 features.


In [8]:
# TFIDF Features
tfidf = TfidfVectorizer(max_features=800, 
                        ngram_range=(1, 2), 
                        stop_words='english')
print(f'Fitting TFIDF.. ')
tfidf_matrix_train = tfidf.fit_transform(train_df['CombinedText'])
print(f'Predicting with fitted TFIDF.. ')
tfidf_matrix = tfidf.transform(all_df['CombinedText'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), 
                        columns=tfidf.get_feature_names_out())
all_df = pd.concat([all_df.reset_index(drop=True), 
                    tfidf_df.reset_index(drop=True)], axis=1)
tfidf_features = tfidf.get_feature_names_out().tolist()

print(f'TFIDF -> {len(tfidf_features)} features')
print(tfidf_features)
features.extend(tfidf_features)

Fitting TFIDF.. 
TFIDF -> 800 features
['10', '20', '30', '34', '8217', 'able', 'absolutely', 'act', 'acted', 'acting', 'action', 'actor', 'actors', 'actress', 'actual', 'actually', 'add', 'added', 'addition', 'adventure', 'age', 'ago', 'air', 'alien', 'amazing', 'amazon', 'america', 'american', 'animated', 'animation', 'anime', 'appears', 'aren', 'art', 'attempt', 'attention', 'audience', 'audio', 'available', 'average', 'away', 'awesome', 'baby', 'background', 'bad', 'band', 'based', 'basically', 'batman', 'battle', 'beautiful', 'beauty', 'begin', 'beginning', 'begins', 'believable', 'believe', 'best', 'better', 'big', 'bit', 'black', 'blood', 'blu', 'blu ray', 'blue', 'body', 'bond', 'bonus', 'book', 'books', 'boring', 'bought', 'box', 'boy', 'boys', 'break', 'brilliant', 'bring', 'brings', 'british', 'brother', 'brothers', 'brought', 'bruce', 'budget', 'business', 'buy', 'buying', 'called', 'came', 'camera', 'car', 'care', 'career', 'case', 'cast', 'century', 'certain', 'certainly'

In [9]:
# User Average Score
user_avg_score = train_df.groupby('UserId')['Score'].mean().reset_index()
user_avg_score.columns = ['UserId', 'UserAvgScore']
all_df = all_df.merge(user_avg_score, on='UserId', how='left')
all_df.loc[:, 'UserAvgScore'] = all_df['UserAvgScore'].fillna(0)

# User Score Variance
user_score_variance = train_df.groupby('UserId')['Score'].var().reset_index()
user_score_variance.columns = ['UserId', 'UserScoreVariance']
all_df = all_df.merge(user_score_variance, on='UserId', how='left')
all_df.loc[:, 'UserScoreVariance'] = all_df['UserScoreVariance'].fillna(0)

# User Review Count
user_review_count = train_df['UserId'].value_counts().reset_index()
user_review_count.columns = ['UserId', 'UserReviewCount']
all_df = all_df.merge(user_review_count, on='UserId', how='left')
all_df.loc[:, 'UserReviewCount'] = all_df['UserReviewCount'].fillna(0)

# Update features list
features.extend(['UserAvgScore', 'UserScoreVariance', 'UserReviewCount'])

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Vectorize text using CountVectorizer (required for LDA)
tf_vectorizer = CountVectorizer(max_df=0.95, 
                                min_df=2, 
                                stop_words='english')
tf_train = tf_vectorizer.fit_transform(train_df['CombinedText'])
tf = tf_vectorizer.transform(all_df['CombinedText'])

# Fit LDA model
n_topics = 10  # You can adjust the number of topics
lda = LatentDirichletAllocation(n_components=n_topics, 
                                random_state=42, 
                                n_jobs=-1)
print(f'Fitting LDA with {n_topics} topics...')
lda_topics_train = lda.fit_transform(tf_train)
lda_topics = lda.transform(tf)

# Create a DataFrame with topic distributions
topic_columns = [f'Topic_{i}' for i in range(n_topics)]
lda_df = pd.DataFrame(lda_topics, columns=topic_columns)

# Concatenate the topic distributions with the main DataFrame
all_df = pd.concat([all_df.reset_index(drop=True), lda_df.reset_index(drop=True)], axis=1)

# Update features list
features.extend(topic_columns)

Fitting LDA with 10 topics...


In [11]:
### Data Splitting
print(f'\n* Using a total of {len(features)} features.')
X = all_df[features]
X.loc[:, 'ScoreVariance'] = X['ScoreVariance'].fillna(0)

X_train = X[~all_df['Id'].isin(test_ids_set)]
y_train = train_df['Score']
X_test = X[all_df['Id'].isin(test_ids_set)]


* Using a total of 834 features.


In [12]:
# Train/Val Split
X_trn, X_val, y_trn, y_val = train_test_split(
    X_train, 
    y_train, 
    test_size=0.2, 
    random_state=0
)

In [17]:
# Create and Run Pipe
znorm_features = ['Helpfulness', 'DownVotes', 'AvgScore', 'ScoreVariance', 'ReviewCount',
                  'TextLength', 'TextExclamations', 'TextQuestions', 'TextCapitalLetters']
minmax_features = ['Time', 'TimeYear', 'TimeMonth', 'TimeDays']
# log_features = ['HelpfulnessNumerator', 'HelpfulnessDenominator', 'ReviewCount']
nonlinear_features = ['HelpfulnessNumerator', 'HelpfulnessDenominator']


# Set up the preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('znorm', StandardScaler(), znorm_features),
        ('minmax', MinMaxScaler(), minmax_features),
        # ('quantile', QuantileTransformer(output_distribution='uniform'), 
        #                                  nonlinear_features)  
        ('nonlinear', PowerTransformer(method='yeo-johnson', standardize=False),
         nonlinear_features),
    ],
    remainder='passthrough'  # Keep other columns as they are
)


pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=300, 
                                          min_samples_split=5, 
                                          max_depth=None,
                                          random_state=0,
                                          n_jobs=-1))
])

# Setup ensembly classifiers
# clf3 = RandomForestClassifier(n_estimators=200, 
#                               min_samples_split=5, 
#                               max_depth=None,
#                               random_state=0,
#                               n_jobs=-1)
# clf2 = GradientBoostingClassifier()
# clf1 = LinearSVC(dual=False, penalty='l1')
# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', VotingClassifier(
#         estimators=[('clf1', clf1), ('clf2', clf2), ('clf3', clf3)],
#         voting='soft'  # Soft voting to consider the probabilities
#     ))
# ])

In [18]:
### Train
pipeline.fit(X_trn, y_trn)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [19]:
### Validate on val
y_pred = pipeline.predict(X_val)

# Eval
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

"""
61.45, 61.9
TFIDF 62.76

Hparam Tuning
- 62.43 | Est 50,  min-samp 10, max-depth N
- 62.84 | Est 100, min-samp 10, max-depth N
- 63.04 | Est 200, min-samp 10, max-depth N
- 62.96 | Est 200, min-samp 20, max-depth N
- 63.04 | Est 200, min-samp 5,  max-depth N, ~280 max-depth
- 63.02 | Est 200, min-samp 5,  max-depth 100
-------
[TF-IDF 500 -> 1000]
- 63.13 | Est 200, min-samp 5,  max-depth N, ~320 max-depth
[TF-IDF 500 -> 1000 + PCA 100]
- 62.30 | Est 200, min-samp 5,  max-depth N, ~320 max-depth
- 63.44 |
- 65.71
- 67.36
""";

Validation Accuracy: 0.6744


In [20]:
submit_df = pd.read_csv("./data/sample.csv")
y_test = pipeline.predict(X_test)

submission = pd.DataFrame({
    'Id': test_df['Id'],
    'Score': y_test
})
submit_df = submit_df[['Id']].merge(submission, on='Id', how='left')
submit_df.to_csv('submit.csv', index=False)

In [10]:
rf_model = pipeline.named_steps['classifier']

# Get the maximum depth of each tree
tree_depths = [estimator.tree_.max_depth for estimator in rf_model.estimators_]

# Find the maximum depth among all trees
max_tree_depth = max(tree_depths)

print(f"Maximum depth of the trees in the Random Forest: {max_tree_depth}")

# Optionally, you can check the average depth if you're interested
average_tree_depth = sum(tree_depths) / len(tree_depths)
print(f"Average depth of the trees in the Random Forest: {average_tree_depth:.2f}")

Maximum depth of the trees in the Random Forest: 324
Average depth of the trees in the Random Forest: 240.35
