In [2]:
## Score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

## Models
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


In [3]:
import pandas as pd
from data.data import Data

data_raw = Data().get_all_data()

In [4]:
# df1 is fine for PreProcessing


df1 = data_raw['twitter_MBTI']
df1.drop(df1.columns[0], axis=1, inplace=True)
df1 = df1.rename(columns={'text': 'posts', 'label': 'type'})

df2 = data_raw['MBTI 500']

df3 = data_raw['mbti_1']
df3 = df3[['posts', 'type']]


## Combined all data for PP if desired
data_combined = pd.concat([df1,df1,df3], axis=0)
data_combined.head()

Unnamed: 0,posts,type
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj
1,@Hispanthicckk Being you makes you look cute||...,intj
2,@Alshymi Les balles sont réelles et sont tirée...,intj
3,"I'm like entp but idiotic|||Hey boy, do you wa...",intj
4,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj


In [4]:
# from scripts.preprocessing import preprocessing

# data_pp = preprocessing(data_combined)

# TypeError: expected string or bytes-like object

In [5]:
## Data for model selection
data = df2

In [6]:
data['e_i'] = data['type'].astype(str).str[0]
data['s_n'] = data['type'].astype(str).str[1]
data['f_t'] = data['type'].astype(str).str[2]
data['p_j'] = data['type'].astype(str).str[3]

column_list = list(data.columns)[2:]
column_list

['e_i', 's_n', 'f_t', 'p_j']

In [10]:
models_list = [SGDClassifier(), 
               RandomForestClassifier(max_depth=5, n_estimators=10, max_features=2), 
            #    AdaBoostClassifier(),
               DecisionTreeClassifier(max_depth=5), 
               KNeighborsClassifier(3),
            #    SVC(kernel="linear", C=0.025),
               GaussianNB(), 
               MultinomialNB(),
               QuadraticDiscriminantAnalysis()]

In [25]:
def BaselineModel(X_train, X_test, y_train, model_selection):
    count_vectorizer = CountVectorizer()

    X_bow_train = count_vectorizer.fit_transform(X_train)
    X_bow_test = count_vectorizer.fit_transform(X_test)


    ## Apparently, you need to fill out these to arrays to run them through
    ## Some of the beefier models, try to figure out how
    
    # X_train = X_bow_train.toarray()
    # X_bow_test = X_bow_test.toarray()

    model = model_selection

    model.fit(X_bow_train, y_train)
    
    return model, X_bow_test

In [12]:
## Finds the Macro F1 - Score
def PredictDict(model, X_bow_test, y_test):
    y_pred = cross_val_predict(model, X_bow_test, y_test, cv = 10)
    report = classification_report(y_test, y_pred, output_dict=True)
    return report

For the function below, see if you can modify it to return print functions instead of just doing them, they seem to replicate in the output and it looks ugly.

In [26]:
import numpy as np

## Use our previous code to get F1 Scores for different models
def ModelF1Score_Binary(column, model, dataset):
    """
    Input the Column from the DataFrame as a string for binary modelling.
    Outputs prints of the Confusion Matrix and the Classification Report.
    """
    y_set = dataset[column]
    X_set = dataset['posts']
    
    
    X_train, X_test, y_train, y_test = train_test_split(X_set,y_set,
                                                    test_size=.3,
                                                    random_state=1) 
    
    
    
    model_fit, X_bow_test = BaselineModel(X_train = X_train, 
                                      X_test = X_test, 
                                      y_train = y_train, 
                                      model_selection = model)
    
    print(f"\n{model} has been fit!")
    
    prediction = PredictDict(model = model_fit, 
                       X_bow_test = X_bow_test, 
                       y_test = y_test)
    
    predict_list = list(prediction.keys())
    
    class_1 = predict_list[0]
    class_2 = predict_list[1]
    macro = predict_list[3]
    
    print("----------------------------------------------------------------")
    print(f"{class_1} has a score of: {100*round(prediction[class_1]['f1-score'],4)}%")
    print(f"{class_2} has a score of: {100*round(prediction[class_2]['f1-score'],4)}%")
    print(f"The Combined Macro F1 Average is: {100*round(prediction[macro]['f1-score'],4)}%")
    print("----------------------------------------------------------------")
    

In [37]:
ModelF1Score_Binary(column = column_list[1], 
                    model = models_list[-1], 
                    dataset = data)


MultinomialNB() has been fit!
----------------------------------------------------------------
N has a score of: 96.59%
S has a score of: 49.21%
The Combined Macro F1 Average is:72.89999999999999%
----------------------------------------------------------------


In [40]:
## For loop for all the types?

## Okay well it prints everything a few too many times, 
## but it does get the scores out

for type in column_list:
    ModelF1Score_Binary(column = type,
                        model = models_list[-1],
                        dataset = data)


MultinomialNB() has been fit!
----------------------------------------------------------------
E has a score of: 58.57%
I has a score of: 90.41%
The Combined Macro F1 Average is:74.49%
----------------------------------------------------------------

MultinomialNB() has been fit!
----------------------------------------------------------------
N has a score of: 96.59%
S has a score of: 49.21%
The Combined Macro F1 Average is:72.89999999999999%
----------------------------------------------------------------

MultinomialNB() has been fit!
----------------------------------------------------------------
F has a score of: 83.95%
T has a score of: 91.17%
The Combined Macro F1 Average is:87.56%
----------------------------------------------------------------

MultinomialNB() has been fit!
----------------------------------------------------------------
J has a score of: 78.86999999999999%
P has a score of: 84.75%
The Combined Macro F1 Average is:81.81%
-------------------------------------

In [14]:
## Maybe saving the results as a dict for the return and then
# calling them out afterwards is better use of the function?




ModelF1Score_Binary(column = column_list[0],
                    model = models_list[0],
                    dataset = data)


SGDClassifier() has been fit!
----------------------------------------------------------------
E has a score of: 71.97%
I has a score of: 91.41%
The Combined Macro F1 Average is: 81.69%
----------------------------------------------------------------


In [21]:
ModelF1Score_Binary(column = column_list[0],
                    model = models_list[1],
                    dataset = data)


RandomForestClassifier(max_depth=5, max_features=2, n_estimators=10) has been fit!


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


----------------------------------------------------------------
E has a score of: 0.0%
I has a score of: 86.33%
The Combined Macro F1 Average is: 43.16%
----------------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
ModelF1Score_Binary(column = column_list[0],
                    model = models_list[2],
                    dataset = data)


DecisionTreeClassifier(max_depth=5) has been fit!
----------------------------------------------------------------
E has a score of: 61.1%
I has a score of: 89.82%
The Combined Macro F1 Average is: 75.46000000000001%
----------------------------------------------------------------


In [16]:
ModelF1Score_Binary(column = column_list[0],
                    model = models_list[3],
                    dataset = data)


KNeighborsClassifier(n_neighbors=3) has been fit!
----------------------------------------------------------------
E has a score of: 39.550000000000004%
I has a score of: 86.21%
The Combined Macro F1 Average is: 62.88%
----------------------------------------------------------------


In [23]:
ModelF1Score_Binary(column = column_list[0],
                    model = models_list[4],
                    dataset = data)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/aforbesj/.pyenv/versions/3.10.6/envs/post2personality/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_2999/752105198.py", line 1, in <module>
    ModelF1Score_Binary(column = column_list[0],
  File "/tmp/ipykernel_2999/3475362615.py", line 20, in ModelF1Score_Binary
    model_fit, X_bow_test = BaselineModel(X_train = X_train,
  File "/tmp/ipykernel_2999/3399536739.py", line 3, in BaselineModel
    X_bow_train = count_vectorizer.fit_transform(X_train)
  File "/home/aforbesj/.pyenv/versions/3.10.6/envs/post2personality/lib/python3.10/site-packages/sklearn/feature_extraction/text.py", line 1388, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
  File "/home/aforbesj/.pyenv/versions/3.10.6/envs/post2personality/lib/python3.10/site-packages/sklearn/feature_extraction/text.py", line -1

In [19]:
ModelF1Score_Binary(column = column_list[0],
                    model = models_list[5],
                    dataset = data)


MultinomialNB() has been fit!
----------------------------------------------------------------
E has a score of: 58.57%
I has a score of: 90.41%
The Combined Macro F1 Average is: 74.49%
----------------------------------------------------------------


In [27]:
ModelF1Score_Binary(column = column_list[0],
                    model = models_list[6],
                    dataset = data)

MemoryError: Unable to allocate 116. GiB for an array with shape (74246, 209152) and data type int64