In [7]:
import pandas as pd
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn after this point.
sns.set()

In [8]:
from Twitter.analysis_v2.data_prediction_ml import get_test_train_data
from Twitter.analysis_v2.data_prediction_ml import prepare_model_data
from Twitter.analysis_v2.data_prediction_ml import split_data
from Twitter.analysis_v2.data_prediction_ml import compare_base_models
from Twitter.analysis_v2.data_prediction_ml import find_best_params
from Twitter.analysis_v2.data_prediction_ml import train_evaluate_save
from Twitter.analysis_v2.data_prediction_ml import tree_analysis
from Twitter.analysis_v2.data_prediction_ml import anova_analysis
from Twitter.analysis_v2.data_prediction_ml import chi_analysis
from Twitter.analysis_v2.data_prediction_ml import lasso_analysis
from Twitter.analysis_v2.data_prediction_ml import rfe_analysis
from Twitter.analysis_v2.data_prediction_ml import algorithm_comparison_chart
from Twitter.analysis_v2.data_prediction_ml import get_optimized_model

In [9]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler

In [10]:
# Relative path that is handed to get_test_train_data when loading the splits.
BASE_FOLDER = "../../data/processed_tweets/"

# Numeric feature columns (the ones passed through the scaler).
num_vars = [
    'followers',
    'following',
    'tweet_count',
    'seniority',
]

# Categorical column names; they are also used as prefixes when searching
# for derived columns in get_encoded_cols.
cat_vars = [
    'topics',
    'sentiment',
    'hashtags',
    'verified',
    'day_phase',
    'day_of_week',
    'month',
]

# Column names paired position-by-position with cat_vars
# (presumably their label-encoded versions — confirm against prepare_model_data).
cat_vars_enc = [
    'topics_ids',
    'sentiment_enc',
    'hashtags_enc',
    'verified_enc',
    'day_phase_enc',
    'day_of_week_enc',
    'month_enc',
]

# Numeric columns first, followed by the encoded categorical columns.
variables_to_predict = [
    *num_vars,
    'verified_enc',
    'day_phase_enc',
    'day_of_week_enc',
    'month_enc',
    'topics_ids',
    'sentiment_enc',
    'hashtags_enc',
]

In [11]:
# Load the train and test frames from BASE_FOLDER through the project helper.
# NOTE(review): the meaning of the second positional argument (False) is not
# visible in this notebook — confirm it in data_prediction_ml.
train_df, test_df = get_test_train_data(BASE_FOLDER, False)

[]


In [6]:
# Run the project's model-data preparation on both splits, rebinding the same
# names (so re-running this cell applies the preparation a second time).
# NOTE(review): the saved output of this cell is "KeyError: 'topics_ids'" and its
# execution count (6) is lower than the load cell's (11); re-run from a fresh
# kernel to check whether the error still reproduces.
train_df = prepare_model_data(train_df)
test_df = prepare_model_data(test_df)

KeyError: 'topics_ids'

In [None]:
# Break the prepared frames into four objects using the project helper.
# presumably X_* are feature frames and y_* the target — confirm against split_data.
X_train, y_train, X_test, y_test = split_data(train_df, test_df)

In [None]:
def get_encoded_cols():
    """Return the X_train columns derived from the categorical variables.

    For every name in ``cat_vars`` (in order), the columns of the module-level
    ``X_train`` whose name starts with it are collected. Columns listed in
    ``cat_vars`` or ``cat_vars_enc``, and the column ``'topics_cleaned'``,
    are left out.

    Prints how many columns were collected and returns them as a list.
    """
    skipped = {*cat_vars_enc, *cat_vars, 'topics_cleaned'}
    all_columns = X_train.columns
    found = [
        name
        for prefix in cat_vars
        for name in all_columns[all_columns.str.startswith(prefix)].tolist()
        if name not in skipped
    ]
    print(len(found), "encoded cols found")
    return found

In [None]:
# Collect the derived categorical column names once; the following cells reuse this list.
cat_vars_encoded = get_encoded_cols()

In [None]:
# Replace missing values in the derived categorical columns with 0.
# The test split receives the same treatment as the training split: these
# columns are selected from X_test later on, and leaving NaN there would make
# the two splits inconsistent at evaluation time.
for cat in cat_vars_encoded:
    X_train[cat] = X_train[cat].fillna(0)
    # The list was built from X_train's columns, so guard against a column
    # that is absent from the test frame.
    if cat in X_test.columns:
        X_test[cat] = X_test[cat].fillna(0)

## Standardization

In [None]:
# The derived categorical columns are set aside as they are; only the numeric
# columns go through the scaler.
X_train_cats = X_train[cat_vars_encoded]

# Fit on the training split only, then reuse the fitted scaler on the test split.
scaler = StandardScaler()
scaler.fit(X_train[num_vars])

# transform() returns a plain array, so rebuild a frame with the original
# column labels and row index.
X_train_num_scaled = pd.DataFrame(
    scaler.transform(X_train[num_vars]),
    columns=X_train[num_vars].columns,
    index=X_train.index,
)

X_test_num_scaled = pd.DataFrame(
    scaler.transform(X_test[num_vars]),
    columns=X_test[num_vars].columns,
    index=X_test.index,
)

## Feature Selection

In [None]:
# Empty frames with one row per feature. They are passed to the selection
# helpers below and later scanned for 'Discard' strings.
# Each frame must be indexed by the features it describes: numeric decisions by
# the scaled numeric columns, categorical decisions by the categorical columns.
# (The two indexes were previously swapped, so the kept-feature lists derived
# from them pointed at the wrong column set.)
num_feat_to_keep = pd.DataFrame(index=X_train_num_scaled.columns)
cat_feat_to_keep = pd.DataFrame(index=X_train_cats.columns)

### Categorical variables selection

#### Chi-square

In [None]:
# Chi-square selection over the categorical columns against y_train.
# presumably the helper writes its keep/discard verdict into cat_feat_to_keep
# (that frame is scanned for 'Discard' strings later) — confirm in data_prediction_ml.
chi_analysis(cat_feat_to_keep, X_train_cats, y_train)

### Numerical variables selection

#### ANOVA

In [None]:
# ANOVA-based selection over the scaled numeric columns against y_train.
# presumably the helper writes its keep/discard verdict into num_feat_to_keep — confirm in data_prediction_ml.
anova_analysis(num_feat_to_keep, X_train_num_scaled, y_train)

#### Tree-based

In [None]:
# Tree-based selection over the scaled numeric columns against y_train.
# presumably the helper writes its keep/discard verdict into num_feat_to_keep — confirm in data_prediction_ml.
tree_analysis(num_feat_to_keep, X_train_num_scaled, y_train)

#### LASSO Regression

In [None]:
# LASSO-based selection over the scaled numeric columns against y_train.
# presumably the helper writes its keep/discard verdict into num_feat_to_keep — confirm in data_prediction_ml.
lasso_analysis(num_feat_to_keep, X_train_num_scaled, y_train)

#### Recursive Feature Elimination

In [None]:
# Recursive-feature-elimination selection over the scaled numeric columns against y_train.
# presumably the helper writes its keep/discard verdict into num_feat_to_keep — confirm in data_prediction_ml.
rfe_analysis(num_feat_to_keep, X_train_num_scaled, y_train)

### Final variables

In [None]:
# Per numeric feature, count how many columns of the vote frame contain 'Discard'.
# The tally column itself is excluded from the count so that the cell can be
# re-run: once it exists it holds integers, and applying .str to it would raise.
num_feat_to_keep['Discard Nr'] = (
    num_feat_to_keep
    .drop(columns='Discard Nr', errors='ignore')
    .apply(lambda x: x.str.findall('Discard').str.len())
    .sum(axis=1)
    .astype(int)
)
num_feat_to_keep

In [None]:
# Per categorical feature, count how many columns of the vote frame contain 'Discard'.
# The tally column itself is excluded from the count so that the cell can be
# re-run: once it exists it holds integers, and applying .str to it would raise.
cat_feat_to_keep['Discard Nr'] = (
    cat_feat_to_keep
    .drop(columns='Discard Nr', errors='ignore')
    .apply(lambda x: x.str.findall('Discard').str.len())
    .sum(axis=1)
    .astype(int)
)
cat_feat_to_keep

In [None]:
# A categorical feature is kept only with zero 'Discard' votes;
# a numeric feature is kept with at most two.
cat_to_keep = cat_feat_to_keep.loc[cat_feat_to_keep['Discard Nr'].lt(1)].index.tolist()
num_to_keep = num_feat_to_keep.loc[num_feat_to_keep['Discard Nr'].lt(3)].index.tolist()
print([*cat_to_keep, *num_to_keep])

In [None]:
# Rebuild the training features: the retained categorical columns first (copied so
# the added columns do not write into a slice), then the retained scaled numeric columns.
X_train = X_train.loc[:, cat_to_keep].copy()
X_train[num_to_keep] = X_train_num_scaled.loc[:, num_to_keep]

In [None]:
# Rebuild the test features with the same column selection as the training features:
# retained categorical columns (copied), then the retained scaled numeric columns.
X_test = X_test.loc[:, cat_to_keep].copy()
X_test[num_to_keep] = X_test_num_scaled.loc[:, num_to_keep]

## Balancing the dataset

### SMOTE Method

In [None]:
# Show the class counts of the training target before resampling.
print("Before over sampling: ", Counter(y_train))
# SMOTE with a fixed random_state so the resampling is repeatable.
over_sample = SMOTE(random_state=7)
# Only the training split is resampled; X_train / y_train are rebound to the result.
X_train, y_train = over_sample.fit_resample(X_train, y_train)
# Show the class counts again after resampling.
print("After over sampling: ", Counter(y_train))

## Models

In [None]:
# Settings for the baseline comparison. `seed` is defined before the model list
# so the estimators below can use it.
num_folds = 5
seed = 7
scoring = 'accuracy'

# Candidate baseline classifiers as (short name, estimator) pairs.
# Every estimator whose fit involves randomness receives random_state=seed so the
# comparison gives the same ranking on each run; previously `seed` was defined
# but never used. LDA and GaussianNB take no random_state, and the lbfgs solver
# of LogisticRegression does not use one.
models = [
    ('LR', LogisticRegression(solver='lbfgs')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('AB', AdaBoostClassifier(random_state=seed)),
    ('GBM', GradientBoostingClassifier(random_state=seed)),
    ('RFC', RandomForestClassifier(n_estimators=100, random_state=seed)),
    ('ET', ExtraTreesClassifier(random_state=seed)),
]

# NOTE(review): whether compare_base_models shuffles its folds (and with which
# seed) is not visible here — confirm in data_prediction_ml.
best_base_model, names, results = compare_base_models(models, X_train, y_train, scoring, num_folds)

# Chart of the per-model results; the meaning of the trailing False is not visible here.
algorithm_comparison_chart('Comparação dos modelos base', 'modelos', 'precisão (0-1)', names, results, False)

## Hyperparameter Optimization

In [None]:
# Hand the second element of best_base_model to the project's optimization helper
# together with the training data.
# presumably best_base_model is a (name, estimator) pair like the entries of `models`;
# the meaning of the False argument is not visible here — confirm in data_prediction_ml.
optimized_model = get_optimized_model(best_base_model[1], False, X_train, y_train)

## Test final model

In [None]:
# Pass the optimized model plus the training and test splits to the project helper.
# NOTE(review): what is evaluated and where the model is saved is defined in
# data_prediction_ml, not in this notebook.
train_evaluate_save(optimized_model, X_train, y_train, X_test, y_test)