In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_palette('muted')
sns.set_color_codes('muted')
sns.set_style('white')

import warnings
warnings.filterwarnings('ignore')

import re
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
%config InlineBackend.figure_format = 'retina'

In [3]:
!pip install openpyxl


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
VideoGames = pd.read_excel('VideoGames.xlsx')

VideoGames.head()

Unnamed: 0,Game name,Year of release,Region,Sales,Platform,Genre,Developer,Publisher,User estimate,Users qty,Critics estimate,Critic qty,Rating
0,Wii Sports,2006.0,NA_Sales,41.36,Wii,Sport games,Nintendo,Nintendo,8.0,322.0,76.0,51.0,E
1,Wii Sports,2006.0,EU_Sales,28.96,Wii,Sport games,Nintendo,Nintendo,8.0,322.0,76.0,51.0,E
2,Wii Sports,2006.0,JP_Sales,3.77,Wii,Sport games,Nintendo,Nintendo,8.0,322.0,76.0,51.0,E
3,Wii Sports,2006.0,Other_Sales,8.45,Wii,Sport games,Nintendo,Nintendo,8.0,322.0,76.0,51.0,E
4,Super Mario Bros.,1985.0,NA_Sales,29.08,NES,Platformers,,Nintendo,,,,,


In [5]:
VideoGames.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66876 entries, 0 to 66875
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Game name         66868 non-null  object 
 1   Year of release   65800 non-null  float64
 2   Region            66876 non-null  object 
 3   Sales             66876 non-null  float64
 4   Platform          66876 non-null  object 
 5   Genre             66868 non-null  object 
 6   Developer         40384 non-null  object 
 7   Publisher         66660 non-null  object 
 8   User estimate     40060 non-null  object 
 9   Users qty         30360 non-null  float64
 10  Critics estimate  32548 non-null  float64
 11  Critic qty        32548 non-null  float64
 12  Rating            39800 non-null  object 
dtypes: float64(5), object(8)
memory usage: 6.6+ MB


In [6]:
VideoGames = VideoGames.loc[VideoGames['Year of release'].isin([2010, 2011, 2012, 2013, 2014, 2015, 2016])]

In [7]:
VideoGames.head()

Unnamed: 0,Game name,Year of release,Region,Sales,Platform,Genre,Developer,Publisher,User estimate,Users qty,Critics estimate,Critic qty,Rating
56,Kinect Adventures!,2010.0,NA_Sales,15.0,X360,Other,Good Science Studio,Microsoft Game Studios,6.3,106.0,61.0,45.0,E
57,Kinect Adventures!,2010.0,EU_Sales,4.89,X360,Other,Good Science Studio,Microsoft Game Studios,6.3,106.0,61.0,45.0,E
58,Kinect Adventures!,2010.0,JP_Sales,0.24,X360,Other,Good Science Studio,Microsoft Game Studios,6.3,106.0,61.0,45.0,E
59,Kinect Adventures!,2010.0,Other_Sales,1.69,X360,Other,Good Science Studio,Microsoft Game Studios,6.3,106.0,61.0,45.0,E
64,Grand Theft Auto V,2013.0,NA_Sales,7.02,PS3,Action,Rockstar North,Take-Two Interactive,8.2,3994.0,97.0,50.0,M


In [8]:
VideoGames = VideoGames.drop(['User estimate', 'Users qty', 'Critics estimate', 'Critic qty'], axis=1)

In [9]:
#Group the dataset by game name and sum the sales
game_sales = VideoGames.groupby('Game name')['Sales'].sum()

#Reset the index
game_sales = game_sales.reset_index()

#Merge the game_sales dataframe with the original dataset on game name
VideoGames = pd.merge(VideoGames, game_sales, on='Game name')

#Rename the Sales column to Total Sales
VideoGames = VideoGames.rename(columns={'Sales_y': 'Total Sales'})

In [10]:
VideoGames.head(10)

Unnamed: 0,Game name,Year of release,Region,Sales_x,Platform,Genre,Developer,Publisher,Rating,Total Sales
0,Kinect Adventures!,2010.0,NA_Sales,15.0,X360,Other,Good Science Studio,Microsoft Game Studios,E,21.82
1,Kinect Adventures!,2010.0,EU_Sales,4.89,X360,Other,Good Science Studio,Microsoft Game Studios,E,21.82
2,Kinect Adventures!,2010.0,JP_Sales,0.24,X360,Other,Good Science Studio,Microsoft Game Studios,E,21.82
3,Kinect Adventures!,2010.0,Other_Sales,1.69,X360,Other,Good Science Studio,Microsoft Game Studios,E,21.82
4,Grand Theft Auto V,2013.0,NA_Sales,7.02,PS3,Action,Rockstar North,Take-Two Interactive,M,56.58
5,Grand Theft Auto V,2013.0,EU_Sales,9.09,PS3,Action,Rockstar North,Take-Two Interactive,M,56.58
6,Grand Theft Auto V,2013.0,JP_Sales,0.98,PS3,Action,Rockstar North,Take-Two Interactive,M,56.58
7,Grand Theft Auto V,2013.0,Other_Sales,3.96,PS3,Action,Rockstar North,Take-Two Interactive,M,56.58
8,Grand Theft Auto V,2013.0,NA_Sales,9.66,X360,Action,Rockstar North,Take-Two Interactive,M,56.58
9,Grand Theft Auto V,2013.0,EU_Sales,5.14,X360,Action,Rockstar North,Take-Two Interactive,M,56.58


In [12]:
VideoGames.to_excel('VideoGamesaltered.xlsx', index=False)

In [None]:
train.drop_duplicates(subset='OriginalTweet').info()

In [None]:
train.TweetAt = pd.to_datetime(train.TweetAt)

In [None]:
train.TweetAt.loc[0]

In [None]:
# tweets_per_day
tweets_per_day = train[['TweetAt']].set_index(train['TweetAt']).resample('D').count()
tweets_per_day

In [None]:
# tweets_per_day_simple  strftime
tweets_per_day_simple = train.TweetAt.dt.strftime('%m-%d').value_counts().sort_index()
tweets_per_day_simple

In [None]:
train.Location.value_counts()

In [None]:
merge_locations1 = ['California, USA', 'Chicago, IL', 'San Francisco, CA', 'USA', 'Los Angeles, CA', 'Washington, DC', 'New York, NY']
train['Location'].replace(merge_locations1, 'United States', inplace=True)

merge_locations2 = ['England, United Kingdom', 'UK', 'London, England', 'London']
train['Location'].replace(merge_locations2, 'United Kingdom', inplace=True)

merge_locations3 = ['Toronto, Ontario']
train['Location'].replace(merge_locations3, 'Canada', inplace=True)


In [None]:
tweets_by_country  = train.Location.value_counts()
tweets_by_country_freq = tweets_by_country[tweets_by_country > 100]
tweets_by_country_freq

In [None]:
tweets_by_country_freq.index.tolist()

In [None]:
train.groupby(['Location','Sentiment']).size().reset_index()

In [None]:
tweets_by_country_sentiment = train.groupby(['Location','Sentiment']).size().reset_index()
tweets_by_country_sentiment.columns = ['Location', 'Sentiment', 'Count']

In [None]:
train['month'] = train.TweetAt.dt.month
train['day'] = train.TweetAt.dt.day
train['dayofweek'] = train.TweetAt.dt.dayofweek
train['weekday'] = train.TweetAt.dt.weekday

In [None]:
train.dayofweek.value_counts()

In [None]:
train['tweetlength'] = train.OriginalTweet.str.len()

In [None]:
train.info()

In [None]:
train['OriginalTweet'][1]

In [None]:
import re

def remove_usernames_links(tweet):
    tweet = re.sub('@[^\s]+','',tweet)
    tweet = re.sub('http[^\s]+','',tweet)
    tweet = re.sub(r'[^a-zA-Z\s]', '', tweet)
    tweet = ' '.join(tweet.split())
    return tweet

train['CleanTweet'] = train['OriginalTweet'].apply(remove_usernames_links)

In [None]:
train['CleanTweet'] = train['CleanTweet'].apply(lambda x: x.replace('\n', ' '))
train['CleanTweet'] = train['CleanTweet'].str.lower()

In [None]:
def cleaning_repeating_char(text):
    return re.sub(r'(.)1+', r'1', text)
train['CleanTweet'] = train['CleanTweet'].apply(lambda x: cleaning_repeating_char(x))

In [None]:
train['CleanTweet'][26968]

In [None]:
train.CleanTweet[1].split(' ')

In [None]:
import nltk
nltk.download('omw-1.4')

In [None]:
from nltk.corpus import wordnet
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def lemmatize_text(text):
    # Split the text into words
    words = nltk.word_tokenize(text)
    # Lemmatize each word and join them back into a string
    return ' '.join([wnl.lemmatize(word, get_wordnet_pos(word)) for word in words])

# Apply the lemmatization function to the text data
train['CleanTweet'] = train['CleanTweet'].apply(lemmatize_text)

In [None]:
train.CleanTweet[16]

In [None]:
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(train[['CleanTweet']], 
                                                                        train.Sentiment,
                                                                        stratify=train.Sentiment,
                                                                        test_size = 0.25,
                                                                        random_state = 42)

In [None]:
len(X_train_text)

## PCA

In [None]:
from sklearn.base import TransformerMixin
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

class DenseTransformer(TransformerMixin):

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.toarray()
pipeline_dr = Pipeline(steps = [('tf_idf_vec', TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                                                            max_df=.85,
                                                            min_df=.0001,
                                                            stop_words = 'english'
                                                            )
                                ), 
                                ('to_dense', DenseTransformer()),
                                ('pca', PCA(n_components=2000)), 
                                ('classifier', GaussianProcessClassifier())
                            ])

%time pipeline_dr.fit(X_train_text.CleanTweet, y_train_text)

pipeline_dr_pred_train = pipeline_dr.predict(X_train_text.CleanTweet)
pipeline_dr_pred_test = pipeline_dr.predict(X_test_text.CleanTweet)

## DecisionTreeClassifier

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
        
params_grid = dict(min_df=[0.001], max_df=[0.9])

resultsdtc = []

for params in tqdm(ParameterGrid(params_grid)):
    pipe = Pipeline(steps = [
        ('tf_idf_vec', TfidfVectorizer(
            token_pattern=r'[A-Za-z]{2,}',
            max_df=params['max_df'],
            min_df=params['min_df'],
            stop_words='english'
        )), 
        ('classifier', DecisionTreeClassifier())
    ])
    
    pipe.fit(X_train_text['CleanTweet'], y_train_text)
    
    pipe_preds_train = pipe.predict(X_train_text.CleanTweet)
    pipe_preds_test = pipe.predict(X_test_text.CleanTweet)
    
    resultsdtc.append(dict(
        params=params,
        
        precision_train=precision_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        precision_test=precision_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),       
        
        recall_train=recall_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        recall_test=recall_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),
        
        f1_train=f1_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        f1_test=f1_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),
    
        accuracy_train=accuracy_score(y_true=y_train_text, y_pred=pipe_preds_train),
        accuracy_test=accuracy_score(y_true=y_test_text, y_pred=pipe_preds_test)
    ))

In [None]:
resultsdtc = pd.DataFrame(resultsdtc)
resultsdtc.sort_values('accuracy_test', ascending=False).head(10).style.bar(vmin=0, vmax=1)

In [None]:
# worse with grid search than on default

from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
        
params_grid = dict(min_df=[0.001], max_df=[0.9], max_depth=[4, 6, 8], min_samples_split=[2, 5, 10], min_samples_leaf=[1, 2, 4])

resultsdtc = []

for params in tqdm(ParameterGrid(params_grid)):
    pipe = Pipeline(steps = [
        ('tf_idf_vec', TfidfVectorizer(
            token_pattern=r'[A-Za-z]{2,}',
            max_df=params['max_df'],
            min_df=params['min_df'],
            stop_words='english'
        )), 
        ('classifier', DecisionTreeClassifier(
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            min_samples_leaf=params['min_samples_leaf'],
            random_state=42
        ))
    ])
    
    pipe.fit(X_train_text['CleanTweet'], y_train_text)
    
    pipe_preds_train = pipe.predict(X_train_text.CleanTweet)
    pipe_preds_test = pipe.predict(X_test_text.CleanTweet)
    
    resultsdtc.append(dict(
        params=params,
        
        precision_train=precision_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        precision_test=precision_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),       
        
        recall_train=recall_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        recall_test=recall_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),
        
        f1_train=f1_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        f1_test=f1_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),
    
        accuracy_train=accuracy_score(y_true=y_train_text, y_pred=pipe_preds_train),
        accuracy_test=accuracy_score(y_true=y_test_text, y_pred=pipe_preds_test)
    ))

In [None]:
resultsdtc = pd.DataFrame(resultsdtc)
resultsdtc.sort_values('accuracy_test', ascending=False).head(10).style.bar(vmin=0, vmax=1)

## Params for TF-IDF

In [None]:
#Separate the pipe for finding best params for vectorizer first for svc

from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm import tqdm

params_grid = dict(min_df=[.0001, .0005, .0007, .001, .005, .01], max_df=[.7, .75, .8, .85, .9])

results = []

for params in tqdm(ParameterGrid(params_grid)):
    tfidf = TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                            max_df=params['max_df'],
                            min_df=params['min_df'],
                            stop_words='english')
    
    tfidf.fit(X_train_text['CleanTweet'])
    X_train_tfidf = tfidf.transform(X_train_text['CleanTweet'])
    X_test_tfidf = tfidf.transform(X_test_text['CleanTweet'])
    
    clf = LinearSVC()
    clf.fit(X_train_tfidf, y_train_text)
    y_pred = clf.predict(X_test_tfidf)
    
    acc = accuracy_score(y_test_text, y_pred)
    
    results.append(dict(
        params=params,
        accuracy=acc
    ))

df = pd.DataFrame(results).sort_values(by='accuracy', ascending=False)
print(df)

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# encode target variable
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train_text)
y_test_enc = le.transform(y_test_text)

params_grid = dict(min_df=[.0001, .0005, .0007, .001, .005, .01], max_df=[.7, .75, .8, .85, .9])

results = []

for params in tqdm(ParameterGrid(params_grid)):
    tfidf = TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                            max_df=params['max_df'],
                            min_df=params['min_df'],
                            stop_words='english')
    
    tfidf.fit(X_train_text['CleanTweet'])
    X_train_tfidf = tfidf.transform(X_train_text['CleanTweet'])
    X_test_tfidf = tfidf.transform(X_test_text['CleanTweet'])
    
    etc = XGBClassifier()
    etc.fit(X_train_tfidf, y_train_enc)
    y_pred_enc = etc.predict(X_test_tfidf)
    y_pred = le.inverse_transform(y_pred_enc)
    
    acc = accuracy_score(y_test_text, y_pred)
    
    results.append(dict(
        params=params,
        accuracy=acc
    ))

df = pd.DataFrame(results).sort_values(by='accuracy', ascending=False)
print(df)

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

params_grid = dict(min_df=[.0001, .0005, .0007, .001, .005, .01], max_df=[.7, .75, .8, .85, .9])

results = []

for params in tqdm(ParameterGrid(params_grid)):
    tfidf = TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                            max_df=params['max_df'],
                            min_df=params['min_df'],
                            stop_words='english')
    
    tfidf.fit(X_train_text['CleanTweet'])
    X_train_tfidf = tfidf.transform(X_train_text['CleanTweet'])
    X_test_tfidf = tfidf.transform(X_test_text['CleanTweet'])
    
    clf = ExtraTreesClassifier()
    clf.fit(X_train_tfidf, y_train_text)
    y_pred = clf.predict(X_test_tfidf)
    
    acc = accuracy_score(y_test_text, y_pred)
    
    results.append(dict(
        params=params,
        accuracy=acc
    ))

df = pd.DataFrame(results).sort_values(by='accuracy', ascending=False)
print(df)

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

params_grid = dict(min_df=[.0001, .0005, .0007, .001, .005, .01], max_df=[.7, .75, .8, .85, .9])

results = []

for params in tqdm(ParameterGrid(params_grid)):
    tfidf = TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                            max_df=params['max_df'],
                            min_df=params['min_df'],
                            stop_words='english')
    
    tfidf.fit(X_train_text['CleanTweet'])
    X_train_tfidf = tfidf.transform(X_train_text['CleanTweet'])
    X_test_tfidf = tfidf.transform(X_test_text['CleanTweet'])
    
    clf = MultinomialNB()
    clf.fit(X_train_tfidf, y_train_text)
    y_pred = clf.predict(X_test_tfidf)
    
    acc = accuracy_score(y_test_text, y_pred)
    
    results.append(dict(
        params=params,
        accuracy=acc
    ))

df = pd.DataFrame(results).sort_values(by='accuracy', ascending=False)
print(df)

### The best params for TF-IDF will be
#### LinearSVC: 'max_df': 0.85, 'min_df': 0.0005 
#### CatBoost: takes too long
#### XGBoost: 'max_df': 0.8, 'min_df': 0.0005
#### MultinomialNB: 'max_df': 0.8, 'min_df': 0.001
#### GPC: requires numpy array
#### ExtraTreesClassifier: 'max_df': 0.85, 'min_df': 0.0007

## CatBoost

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier
from tqdm import tqdm
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

params_grid = dict(min_df=[.0005], max_df=[0.85])

results = []

for params in tqdm(ParameterGrid(params_grid)):
    pipe = Pipeline(steps = [('tf_idf_vec', TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                                                            max_df=params['max_df'],
                                                            min_df=params['min_df'],
                                                            stop_words = 'english'
                                                            ))
                            ])

    # Define the parameter grid for the CatBoostClassifier
    param_grid = {
        'classifier__n_estimators': [100, 500, 1000],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.01, 0.05, 0.1]
    }

    # Add the CatBoostClassifier to the pipeline
    pipe.steps.append(['classifier', CatBoostClassifier()])

    # Perform the grid search
    grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5, verbose=1, scoring='accuracy')
    grid_search.fit(X_train_text['CleanTweet'], y_train_text)

    # Print the best parameters
    print(f"Best parameters: {grid_search.best_params_}")

In [None]:
results = pd.DataFrame(results)
results.sort_values('accuracy_test', ascending=False).head(10).style.bar(vmin=0, vmax=1)

## XGBoost

In [None]:
from sklearn.preprocessing import LabelEncoder

# encode target variable
le = LabelEncoder()
y_train_text_enc = le.fit_transform(y_train_text)
y_test_text_enc = le.transform(y_test_text)

params_grid = dict(min_df=[0.0005], max_df=[0.8], learning_rate=[0.01], max_depth=[9], 
                   subsample=[1.0], n_estimators=[400])

resultsxgb = []

for params in tqdm(ParameterGrid(params_grid)):
    pipe = Pipeline(steps = [
        ('tf_idf_vec', TfidfVectorizer(
            token_pattern=r'[A-Za-z]{2,}',
            max_df=params['max_df'],
            min_df=params['min_df'],
            stop_words='english'
        )), 
        ('classifier', XGBClassifier(
            learning_rate=params['learning_rate'],
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            subsample=params['subsample'],
            verbosity=0
        ))
    ])
    
    pipe.fit(X_train_text['CleanTweet'], y_train_text_enc)
    
    pipe_preds_train = pipe.predict(X_train_text.CleanTweet)
    pipe_preds_test = pipe.predict(X_test_text.CleanTweet)
    
    # decode predicted labels
    pipe_preds_train_dec = le.inverse_transform(pipe_preds_train)
    pipe_preds_test_dec = le.inverse_transform(pipe_preds_test)
    
    resultsxgb.append(dict(
        params=params,
        
        precision_train=precision_score(y_true=y_train_text_enc, y_pred=pipe_preds_train, average='macro'),
        precision_test=precision_score(y_true=y_test_text_enc, y_pred=pipe_preds_test, average='macro'),       
        
        recall_train=recall_score(y_true=y_train_text_enc, y_pred=pipe_preds_train, average='macro'),
        recall_test=recall_score(y_true=y_test_text_enc, y_pred=pipe_preds_test, average='macro'),
        
        f1_train=f1_score(y_true=y_train_text_enc, y_pred=pipe_preds_train, average='macro'),
        f1_test=f1_score(y_true=y_test_text_enc, y_pred=pipe_preds_test, average='macro'),
    
        accuracy_train=accuracy_score(y_true=y_train_text_enc, y_pred=pipe_preds_train),
        accuracy_test=accuracy_score(y_true=y_test_text_enc, y_pred=pipe_preds_test),
        
        # add decoded predicted labels
        preds_train_dec=pipe_preds_train_dec,
        preds_test_dec=pipe_preds_test_dec
    ))

In [None]:
resultsxgb = pd.DataFrame(resultsxgb)
resultsxgb

## MultinomialNB

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

params_grid = dict(min_df=[0.001], 
                   max_df=[0.8],
                   alpha=[0.1, 0.5, 1, 2, 5],
                   )

results = []

for params in tqdm(ParameterGrid(params_grid)):
    pipe = Pipeline(steps=[('tf_idf_vec', TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                                                            max_df=params['max_df'],
                                                            min_df=params['min_df'],
                                                            stop_words='english'
                                                            )
                            ),
                            ('classifier', MultinomialNB(alpha=params['alpha']))
                            ])

    pipe.fit(X_train_text['CleanTweet'], y_train_text)

    pipe_preds_train = pipe.predict(X_train_text.CleanTweet)
    pipe_preds_test = pipe.predict(X_test_text.CleanTweet)

    results.append(dict(

        params=params,

        precision_train=precision_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        precision_test=precision_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        recall_train=recall_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        recall_test=recall_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        f1_train=f1_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        f1_test=f1_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        accuracy_train=accuracy_score(y_true=y_train_text, y_pred=pipe_preds_train),
        accuracy_test=accuracy_score(y_true=y_test_text, y_pred=pipe_preds_test)
    ))

In [None]:
results = pd.DataFrame(results)
results.sort_values('accuracy_test', ascending=False).head(10).style.bar(vmin=0, vmax=1)

## GaussianProcessClassifier

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

params_grid = dict(
    min_df=[0.0005],
    max_df=[0.85],
    kernel=[1.0 * RBF(length_scale=1.0), 1.0 * RBF(length_scale=0.5), 1.0 * RBF(length_scale=2.0)],
)

results = []

for params in tqdm(ParameterGrid(params_grid)):
    pipe = Pipeline(steps=[
        ('tf_idf_vec', TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                                       max_df=params['max_df'],
                                       min_df=params['min_df'],
                                       stop_words='english')),
        ('classifier', GaussianProcessClassifier(kernel=params['kernel']))
    ])

    pipe.fit(X_train_text['CleanTweet'], y_train_text)

    pipe_preds_train = pipe.predict(X_train_text.CleanTweet)
    pipe_preds_test = pipe.predict(X_test_text.CleanTweet)

    results.append(dict(
        params=params,

        precision_train=precision_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        precision_test=precision_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        recall_train=recall_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        recall_test=recall_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        f1_train=f1_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        f1_test=f1_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        accuracy_train=accuracy_score(y_true=y_train_text, y_pred=pipe_preds_train),
        accuracy_test=accuracy_score(y_true=y_test_text, y_pred=pipe_preds_test)
    ))

In [None]:
results = pd.DataFrame(results)
results.sort_values('accuracy_test', ascending=False).head(10).style.bar(vmin=0, vmax=1)

## ExtraTreesClassifier

In [None]:
# runs better on default than on grid search
# 54% vs. 28%

from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

params_grid = dict(min_df=[0.0007],
                   max_df=[0.85],
                   max_depth=[6, 7, 8, 9],
                   n_estimators=[100, 200, 300],
                   min_samples_split=[2, 3, 4])

results = []

for params in tqdm(ParameterGrid(params_grid)):
    pipe = Pipeline(steps=[('tf_idf_vec', TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                                                          max_df=params['max_df'],
                                                          min_df=params['min_df'],
                                                          stop_words='english')
                            ),
                           ('classifier', ExtraTreesClassifier(max_depth=params['max_depth'],
                                                                n_estimators=params['n_estimators'],
                                                                min_samples_split=params['min_samples_split']))
                           ])

    pipe.fit(X_train_text['CleanTweet'], y_train_text)

    pipe_preds_train = pipe.predict(X_train_text.CleanTweet)
    pipe_preds_test = pipe.predict(X_test_text.CleanTweet)

    results.append(dict(

        params=params,

        precision_train=precision_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        precision_test=precision_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        recall_train=recall_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        recall_test=recall_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        f1_train=f1_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        f1_test=f1_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        accuracy_train=accuracy_score(y_true=y_train_text, y_pred=pipe_preds_train),
        accuracy_test=accuracy_score(y_true=y_test_text, y_pred=pipe_preds_test)
    ))

In [None]:
results = pd.DataFrame(results)
results.sort_values('accuracy_test', ascending=False).head(10).style.bar(vmin=0, vmax=1)

## SVC

In [None]:
from sklearn.svm import SVC

params_grid = dict(min_df=[0.0005],
                   max_df=[0.85],
                   C=[1, 10, 100],
                   gamma=[0.1, 0.01, 0.001],
                   kernel=['linear', 'rbf'])

results = []

for params in tqdm(ParameterGrid(params_grid)):
    pipe = Pipeline(steps=[('tf_idf_vec', TfidfVectorizer(token_pattern=r'[A-Za-z]{2,}',
                                                          max_df=params['max_df'],
                                                          min_df=params['min_df'],
                                                          stop_words='english')
                            ),
                           ('classifier', SVC(C=params['C'],
                                               gamma=params['gamma'],
                                               kernel=params['kernel']))
                           ])

    pipe.fit(X_train_text['CleanTweet'], y_train_text)

    pipe_preds_train = pipe.predict(X_train_text.CleanTweet)
    pipe_preds_test = pipe.predict(X_test_text.CleanTweet)

    results.append(dict(

        params=params,

        precision_train=precision_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        precision_test=precision_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        recall_train=recall_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        recall_test=recall_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        f1_train=f1_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        f1_test=f1_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),

        accuracy_train=accuracy_score(y_true=y_train_text, y_pred=pipe_preds_train),
        accuracy_test=accuracy_score(y_true=y_test_text, y_pred=pipe_preds_test)
    ))

In [None]:
results = pd.DataFrame(results)
results.sort_values('accuracy_test', ascending=False).head(10).style.bar(vmin=0, vmax=1)

# Results

|  Model                |  Default accuracy |  Grid accuracy |
|-----------------------|-------------------|----------------|
| XGBoost               |  0.522687         |  0.458778      |
| SVC                   |  0.537663         |  0.590896      |
| DecisionTreeClassifier|  0.414442         |  0.339413      |
| ExtraTreesClassifier  |  0.548636         |  0.288998      |
| MultinominalNB        |  0.440391         |  0.452254      |
| CatBoost              |        -          |       -        |
| GaussianPC            |        -          |       -        |

## Embeddings

#### GPT2

In [None]:
!pip install torch

In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from transformers import GPT2Tokenizer, GPT2Model
import torch 

def texts_to_embeddings(texts):
    tokenizer.pad_token = '[PAD]'
    input_ids = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)['input_ids']
    with torch.no_grad():
        embeddings = model(input_ids).last_hidden_state[:, 0, :]
    return embeddings.numpy()

train_texts = train['CleanTweet'].tolist()  # convert to list of strings
train_embeddings = texts_to_embeddings(train_texts)

In [None]:
y = train['Sentiments']
X_train, X_test, y_train, y_test = train_test_split(train_embeddings, y, test_size=0.2, random_state=42)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

#### XLNet

In [None]:
!pip install SentencePiece

In [None]:
from sklearn.model_selection import ParameterGrid
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import XLNetTokenizer, XLNetModel
import torch

# Instantiate the XLNet tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')

def xlnet_embeddings(text):
    # Tokenize the text
    input_ids = torch.tensor([tokenizer.encode(text, add_special_tokens=True)])
    
    # Generate embeddings
    with torch.no_grad():
        output = model(input_ids)[0][:, 0, :].numpy()  # Use the first token as the embedding
    
    return output

params_grid = dict(min_df=[.0001, .0005, .0007, .001, .005, .01], max_df=[.7, .75, .8, .85, .9],max_depth=[6,7,8,9])

results = []

for params in tqdm(ParameterGrid(params_grid)):
    pipe = Pipeline(steps = [('embedding', FunctionTransformer(xlnet_embeddings, validate=False)),
                              ('classifier',DecisionTreeClassifier(max_depth=params['max_depth']
                                                                      ))
                            ])
    
    pipe.fit(X_train_text['CleanTweet'], y_train_text)
    
    
    pipe_preds_train = pipe.predict(X_train_text.CleanTweet)
    pipe_preds_test = pipe.predict(X_test_text.CleanTweet)
    
    results.append(dict(
        
        params=params,
        
        precision_train = precision_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        precision_test = precision_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),       
        
        recall_train = recall_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        recall_test = recall_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),
        
        f1_train = f1_score(y_true=y_train_text, y_pred=pipe_preds_train, average='macro'),
        f1_test = f1_score(y_true=y_test_text, y_pred=pipe_preds_test, average='macro'),
    
        accuracy_train = accuracy_score(y_true=y_train_text, y_pred=pipe_preds_train),
        accuracy_test = accuracy_score(y_true=y_test_text, y_pred=pipe_preds_test)
    ))

In [None]:
results = pd.DataFrame(results)
results.sort_values('accuracy_test', ascending=False).head(10).style.bar(vmin=0, vmax=1)