In [18]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [19]:
from sklearn.semi_supervised import SelfTrainingClassifier, LabelPropagation
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from sklearn.semi_supervised import LabelSpreading
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

In [20]:
! pip install langdetect



In [21]:
from langdetect import detect

In [22]:
df_unknown = pd.read_csv('/content/gdrive/My Drive/Mestrado/semi_supervised_learning.csv')

def detect_en(text):
    try:
        return detect(text) == 'en'
    except:
        return False

df_unknown = df_unknown[df_unknown['text'].apply(detect_en)]
df_unknown

Unnamed: 0,Id,ThreadId,MotherId,text,user,date
0,1,2,,"For what it's worth, here is an article debunk...",134578,1640825191
1,2,2,,Has he been tested for covid? I ask because c...,770719,1640883616
2,3,2,,I've heard of various problems related to the ...,163305,1640995029
3,4,2,3.0,One more thought would be to report it to VAER...,163305,1640995161
4,5,2,3.0,VAERS is an unscientific self reporting system...,19040740,1641054812
...,...,...,...,...,...,...
29278,29279,6478,,Thanks so much for sharing this info. I am on ...,1528343,1291785024
29279,29280,6478,,for those of you that are taking Ritalin how...,1525654,1291975892
29280,29281,6478,,Thank you for you tips as well.I will definetl...,1596032,1297297784
29281,29282,6478,,My doc just changed my meds yesterday. I was t...,1712318,1308397948


In [23]:
import pandas as pd
df = pd.read_csv('/content/gdrive/My Drive/Mestrado/ready.csv')

In [24]:
df.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'Unnamed: 0.1.1.1',
       'ID', 'labels', 'text', 'sentiment', 'text_w_tags', 'text_not_abbr',
       'text_contr', 'text_corrected', 'lower_case', 'no_punct', 'sw_nltk',
       'sw_nltkp', 'sw_spacy', 'sw_spacyp', 'porter_sw_nltk',
       'porter_sw_nltkp', 'porter_sw_spacy', 'porter_sw_spacyp',
       'porter_no_sw', 'porter_vanilla', 'lemma_sw_nltk', 'lemma_sw_nltkp',
       'lemma_sw_spacy', 'lemma_sw_spacyp', 'lemma_no_sw', 'lemma_vanilla',
       'porter_sw_nltk_e', 'porter_sw_nltk_ne', 'porter_sw_nltkp_e',
       'porter_sw_nltkp_ne', 'porter_sw_spacy_e', 'porter_sw_spacy_ne',
       'porter_sw_spacyp_e', 'porter_sw_spacyp_ne', 'porter_no_sw_e',
       'porter_no_sw_ne', 'porter_vanilla_e', 'porter_vanilla_ne',
       'lemma_sw_nltk_e', 'lemma_sw_nltk_ne', 'lemma_sw_nltkp_e',
       'lemma_sw_spacy_e', 'lemma_sw_spacy_ne', 'lemma_sw_spacyp_e',
       'lemma_sw_spacyp_ne', 'lemma_no_sw_e', 'lemma_no_sw_ne',
       'lemma_van

In [25]:
df.sentiment = pd.Categorical(pd.factorize(df.sentiment)[0] + 1)

In [26]:
df_unknown['sentiment'] = -1

In [27]:
df_unknown = df_unknown.sample(10000)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df.sentiment)

In [29]:
df_complete_text = pd.concat([X_train, df_unknown['text']])

In [30]:
df_complete_labels = pd.concat([y_train, df_unknown.sentiment])

In [31]:
sdg_params = dict(alpha=1e-5, penalty="l2", loss="log")
vectorizer_params = dict(ngram_range=(1, 3), min_df=5, max_df=0.8)

In [32]:

# SelfTraining Pipeline
st_pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
    ]
)
# LabelSpreading Pipeline
ls_pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        # LabelSpreading does not support dense matrices
        ("todense", FunctionTransformer(lambda x: x.todense())),
        ("clf", LabelSpreading()),
    ]
)


In [33]:
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    print("Number of training samples:", len(X_train))
    print("Unlabeled samples in training set:", sum(1 for x in y_train if x == -1))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

    print("-" * 10)
    print()


In [34]:
eval_and_print_metrics(ls_pipeline, df_complete_text, df_complete_labels, X_test, y_test)

Number of training samples: 10151
Unlabeled samples in training set: 10000




              precision    recall  f1-score   support

           1       0.00      0.00      0.00        11
           2       0.56      0.97      0.71        29
           3       0.00      0.00      0.00        11

    accuracy                           0.55        51
   macro avg       0.19      0.32      0.24        51
weighted avg       0.32      0.55      0.40        51

----------



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
