In [None]:
import math
from os import walk

import joblib
import numpy as np
import pandas as pd
import datetime

In [19]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import chi2
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [20]:
def get_test_train_data():
    filenames = next(walk(BASE_FOLDER), (None, None, []))[2]
    print(filenames)

    train_df, test_df = pd.DataFrame(), pd.DataFrame()
    if len(filenames) == 2:
        train_df = pd.read_csv(filepath_or_buffer=BASE_FOLDER + filenames[0], sep=",")
        test_df = pd.read_csv(filepath_or_buffer=BASE_FOLDER + filenames[1], sep=",")
    elif len(filenames) > 2:
        test_df = pd.read_csv(filepath_or_buffer=BASE_FOLDER + filenames[len(filenames) - 1], sep=",")
        test_df = convert_and_sort_time(test_df)

        train_df = pd.DataFrame()
        for i in range(len(filenames) - 1):
            df_temp = pd.read_csv(filepath_or_buffer=BASE_FOLDER + filenames[i], sep=",")
            df_temp = convert_and_sort_time(df_temp)
            train_df = pd.concat([train_df, pd.DataFrame.from_records(df_temp)])

    return train_df, test_df

In [21]:
def convert_and_sort_time(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['timestamp'] = [i.replace(tzinfo=datetime.timezone.utc) for i in df['timestamp']]
    return df.sort_values(by='timestamp', ascending=True)

In [28]:
def prepare_model_data(df):
    df = df[df['topics_ids'] != -1].copy()
    df['popularity'] = [0 if retweets == 0 else 1 for retweets in df['retweet_count']]
    return df

In [23]:
def split_data(train_df, test_df):
    X_train = train_df.drop('popularity', axis=1)
    y_train = train_df['popularity']
    print(X_train.shape)
    print(y_train.shape)
    X_test = test_df.drop('popularity', axis=1)
    y_test = test_df['popularity']
    print(X_test.shape)
    print(y_test.shape)
    return X_train, y_train, X_test, y_test

In [24]:
num_folds = 7
seed = 7
scoring = 'accuracy'
validation_size = 0.70
BASE_FOLDER = "../../data/processed_tweets/"
vars = ['like_count', 'retweet_count', 'quote_count', 'reply_count', 'reach', 'topics_ids', 'sentiment_enc', 'day_phase_enc', 'day_of_week_enc', 'month_enc', 'popularity', 'followers', 'following', 'tweet_count', 'verified_enc', 'seniority']
tweet_vars = ['like_count', 'retweet_count', 'quote_count', 'reply_count', 'reach', 'sentiment_enc', 'day_phase_enc', 'day_of_week_enc', 'month_enc', 'topics_ids']
users_vars = ['followers', 'following', 'tweet_count', 'verified_enc', 'seniority']
num_vars = ['like_count', 'retweet_count', 'quote_count', 'reply_count', 'reach', 'followers', 'following', 'tweet_count', 'seniority']
cat_vars = ['sentiment_enc', 'verified_enc', 'day_of_week_enc', 'day_phase_enc', 'month_enc']
variables_to_predict = ['followers', 'following', 'tweet_count', 'seniority', 'verified_enc', 'day_phase_enc', 'day_of_week_enc', 'month_enc', 'topics_ids', 'sentiment_enc', 'hashtags_enc']

In [25]:
train_df, test_df = get_test_train_data()

['tweets_2021.csv', 'tweets_2020.csv', 'tweets_2019.csv']


In [29]:
train_df = prepare_model_data(train_df)
test_df = prepare_model_data(test_df)

In [30]:
X_train, y_train, X_test, y_test = split_data(train_df, test_df)

(363328, 34)
(363328,)
(193987, 34)
(193987,)


## Standardization

## Feature Selection

In [31]:
X_train = X_train[variables_to_predict]
X_test = X_test[variables_to_predict]

## Balacing the dataset

### SMOTE Method

In [32]:
# summarize class distribution
print("Before over sampling: ", Counter(y_train))
over_sample = SMOTE(random_state=seed)
X_train_over, y_train_over = over_sample.fit_resample(X_train, y_train)
# summarize class distribution
print("After over sampling: ", Counter(y_train_over))

Before over sampling:  Counter({0: 304662, 1: 58666})
After over sampling:  Counter({0: 304662, 1: 304662})


## Models

In [None]:
def compare_models(models, X_train, y_train, scoring, num_folds):
    results = []
    names = []
    df_res = pd.DataFrame(columns=['model', 'score_accuracy'])
    for name, model in models:
        kfold = KFold(n_splits=num_folds)
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)

        df_res.loc[len(df_res.index)] = [name, cv_results]

        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
    return df_res

In [51]:
models = [('LR', LogisticRegression(solver='lbfgs')), ('LDA', LinearDiscriminantAnalysis()), ('CART', DecisionTreeClassifier()), ('NB', GaussianNB())]
num_folds = 7
seed = 7
scoring = 'accuracy'
res =compare_models(models, X_train_over, y_train_over, scoring, num_folds)
res

7 fits failed out of a total of 7.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/migueloliveira/opt/anaconda3/envs/pythonProject2/lib/python3.10/site-packages/numpy/core/getlimits.py", line 459, in __new__
    dtype = numeric.dtype(dtype)
TypeError: 'NoneType' object is not callable

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/migueloliveira/opt/anaconda3/envs/pythonProject2/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/migueloliveira/opt/anaconda3/envs/pytho

KeyboardInterrupt: 

In [52]:
results = []
names = []
res = dict()
df_res = pd.DataFrame(columns=['model', 'score_accuracy'])
for name, model in models:
    kfold = KFold(n_splits=num_folds)
    cv_results = cross_val_score(model, X_train_over, y_train_over, cv=kfold, scoring=scoring)
    res[name] = cv_results
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std)
    print(msg)

7 fits failed out of a total of 7.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/migueloliveira/opt/anaconda3/envs/pythonProject2/lib/python3.10/site-packages/numpy/core/getlimits.py", line 459, in __new__
    dtype = numeric.dtype(dtype)
TypeError: 'NoneType' object is not callable

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/migueloliveira/opt/anaconda3/envs/pythonProject2/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/migueloliveira/opt/anaconda3/envs/pytho

TypeError: must be real number, not builtin_function_or_method