In [1]:
import gc

import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [2]:
Xy_data = pd.read_feather("data/labeled_data_clean.feather")
Xy_data["comment"] = Xy_data["comment"].transform(list)

Xy_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704019 entries, 0 to 704018
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   comment  704019 non-null  object
 1   class    704019 non-null  int8  
dtypes: int8(1), object(1)
memory usage: 6.0+ MB


In [3]:
Xy_data["class"].value_counts()

1    313661
0    199718
2    190640
Name: class, dtype: int64

In [4]:
for i in range(5):
    Xy_data = Xy_data.sample(Xy_data.shape[0], replace=False)
    
X_train, X_test, y_train, y_test = train_test_split(Xy_data["comment"],
                                                    Xy_data["class"],
                                                    train_size=0.7)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (492813,)
y_train shape: (492813,)
X_test shape: (211206,)
y_test shape: (211206,)


In [5]:
tfidf_model = TfidfVectorizer(dtype = np.float32, min_df = 10)
tfidf_model.fit(Xy_data["comment"].apply(" ".join))

X_train = tfidf_model.transform(X_train.apply(" ".join))
X_test = tfidf_model.transform(X_test.apply(" ".join))

print(f"New shape X_train: {X_train.shape}")
print(f"New shape X_test: {X_test.shape}")

New shape X_train: (492813, 27110)
New shape X_test: (211206, 27110)


In [None]:
clf_model = SVC(kernel="linear", verbose=True) 
clf_model.fit(X_train, y_train)

[LibSVM]..

In [None]:
run_experimental = False

if run_experimental:

    # Machine/deep learning parameters
    model_parameters = {

        "svm-linear": {"kernel":"linear"},
        "random-forest": {},
        "xgboost": {"n_estimators": 500,
                    "objective": 'multi:softprob',
                    "num_class": 3,
                    "use_label_encoder": False,
                    "eval_metric":"mlogloss"},

    }

    # Machine/deep learning options
    model_options = {

        "svm-linear": SVC,
        "random-forest": RandomForestClassifier,
        "xgboost": xgb.XGBClassifier

    }

    sample = 2000

    # Iterate over the data sets
    for key_data, values_data in data_options.items():
        
        mask_train_test = values_data["type"] == "labeled"

        if sample:
            train_test_data = values_data[mask_train_test].sample(sample)
        else:
            train_test_data = values_data[mask_train_test]

        # X and y to train/test later. Getting only labeled data
        X, y = train_test_data["X"], train_test_data["y"]
        y = y.transform(int).to_numpy().reshape(-1,)

        # Train/Test split 80/20
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            train_size = 0.8,
                                                            random_state = 42)

        # Removing variables no longer used and free memory
        del X, y, mask_train_test
        gc.collect()

        # Iterate over the vectorization options
        for key_vec, vec_func in vectorization_options.items():

            # We'll use all data to fit the vectorization, not only the labeled
            input_data_vec = vectorization_preprocessing[key_vec](values_data["X"])
            vectorization_obj_fitted = vec_func()
            vectorization_obj_fitted.fit(input_data_vec)

            # We'll only transform the labeled data
            input_data_transform_train = vectorization_preprocessing[key_vec](X_train)
            input_data_transform_test = vectorization_preprocessing[key_vec](X_test)

            X_train_vectorized = vectorization_obj_fitted.transform(input_data_transform_train)
            X_test_vectorized = vectorization_obj_fitted.transform(input_data_transform_test)

            if vectorization_processing[key_vec] is not None:
                X_train_vectorized = vectorization_processing[key_vec](X_train_vectorized)
                X_test_vectorized = vectorization_processing[key_vec](X_test_vectorized)

            # Iterate over the machine/deep learning options
            for key_model, model_obj in model_options.items():

                # Initializing model with the respective parameters
                
                model_clf = model_obj(**model_parameters[key_model])

                model_clf.fit(X_train_vectorized, y_train)

                y_hat = model_clf.predict(X_test_vectorized)
                
                report = classification_report(y_test,
                                               y_hat,
                                               labels=[0,1,2],
                                               target_names=["Negative","Neutral","Positive"])

                matrix_report = confusion_matrix(y_test, y_hat, labels=[0, 1, 2])
                
                print(f"data: {key_data}")
                print(f"vectorization: {key_vec}")
                print(f"model: {key_model}")
                print(report)
                print(f"\n{matrix_report}")
                print("-" * 60)