<h1>Imports</h1>

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pandas as pd
from nltk.corpus import stopwords
import emoji as em

<h1>Import des données</h1>

In [None]:
# Load the three XML splits; every row containing a missing value is discarded.
data_train = pd.read_xml("./data/apprentissage/train.xml").dropna()

data_dev = pd.read_xml("./data/apprentissage/dev.xml").dropna()

data_test = pd.read_xml("./data/test/test.xml").dropna()

<h1>Fonction de vectorisation et prédiction</h1>

In [None]:
def run_SGDClassifier(train_x_data, y_train, dev_x_data, action, ngram, random_state=None):
    """Vectorize texts, fit an SGD-trained linear classifier and predict labels.

    The vectorizer is fitted on the training texts only and then reused to
    transform the evaluation texts, so both share the same vocabulary.

    Parameters
    ----------
    train_x_data : iterable of str
        Training documents.
    y_train : array-like
        Labels matching ``train_x_data``.
    dev_x_data : iterable of str
        Documents for which labels are predicted.
    action : {"tfidf", "cvec"}
        Vectorization scheme: ``"tfidf"`` uses ``TfidfVectorizer``,
        ``"cvec"`` uses ``CountVectorizer``.
    ngram : tuple of (int, int)
        Passed to the vectorizer as ``ngram_range``.
    random_state : int or None, default None
        Forwarded to ``SGDClassifier``. Pass an integer to make runs
        reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    y_pred
        The output of ``SGDClassifier.predict`` on the transformed
        ``dev_x_data``.

    Raises
    ------
    ValueError
        If ``action`` is not one of the supported schemes.
    """
    # Fail fast with a clear message instead of hitting an unbound X_train
    # further down when the scheme name is mistyped.
    if action not in ("tfidf", "cvec"):
        raise ValueError(
            f"Unsupported action {action!r}; expected 'tfidf' or 'cvec'."
        )

    # Both vectorizers take the same configuration, so only the class differs.
    vectorizer_cls = TfidfVectorizer if action == "tfidf" else CountVectorizer
    vectorizer = vectorizer_cls(
        stop_words=stopwords.words("french"),
        analyzer="word",
        lowercase=True,
        ngram_range=ngram,
    )

    X_train = vectorizer.fit_transform(train_x_data)
    X_dev = vectorizer.transform(dev_x_data)

    clf = SGDClassifier(loss="hinge", penalty="l2", random_state=random_state)
    clf.fit(X_train, y_train)

    return clf.predict(X_dev)

<h1>Prédictions avec données normales. X = [commentaires]</h1>

In [None]:
# TF-IDF features, unigrams only, on the raw comments.
y_pred = run_SGDClassifier(
    train_x_data=data_train["commentaire"],
    y_train=data_train["note"],
    dev_x_data=data_dev["commentaire"],
    action="tfidf",
    ngram=(1, 1),
)
accuracy_score(data_dev["note"], y_pred)

In [None]:
# TF-IDF features, unigrams and bigrams, on the raw comments.
y_pred = run_SGDClassifier(
    train_x_data=data_train["commentaire"],
    y_train=data_train["note"],
    dev_x_data=data_dev["commentaire"],
    action="tfidf",
    ngram=(1, 2),
)
accuracy_score(data_dev["note"], y_pred)

In [None]:
# TF-IDF features, bigrams only, on the raw comments.
y_pred = run_SGDClassifier(
    train_x_data=data_train["commentaire"],
    y_train=data_train["note"],
    dev_x_data=data_dev["commentaire"],
    action="tfidf",
    ngram=(2, 2),
)
accuracy_score(data_dev["note"], y_pred)

In [None]:
# Raw term counts, unigrams only, on the raw comments.
y_pred = run_SGDClassifier(
    train_x_data=data_train["commentaire"],
    y_train=data_train["note"],
    dev_x_data=data_dev["commentaire"],
    action="cvec",
    ngram=(1, 1),
)
accuracy_score(data_dev["note"], y_pred)

In [None]:
# Raw term counts, unigrams and bigrams, on the raw comments.
y_pred = run_SGDClassifier(
    train_x_data=data_train["commentaire"],
    y_train=data_train["note"],
    dev_x_data=data_dev["commentaire"],
    action="cvec",
    ngram=(1, 2),
)
# Bare final expression so the score is rendered as the cell's output value,
# matching the other evaluation cells of this notebook.
accuracy_score(data_dev["note"], y_pred)

In [None]:
# Save the predictions of the best model so they can be submitted to the evaluation platform.
# NOTE(review): the row appended by hand for "review_27303387" presumably stands in for a
# test review removed by the dropna() applied when data_test is loaded — confirm before
# re-enabling this cell.

# y_pred = run_SGDClassifier(data_train["commentaire"], data_train["note"], data_test["commentaire"], "cvec", (1,2))

# file = pd.DataFrame()
# file["review_id"]=data_test["review_id"]
# file["note"]=y_pred
# file = pd.concat([file, pd.DataFrame.from_records([{"review_id": "review_27303387", "note" : "2,0"}])])
# file = file.sort_values(by=["review_id"])
# file.to_csv("./resultats.txt", header=None, index=None, sep=' ', mode='w')

In [None]:
# Raw term counts, bigrams only, on the raw comments.
y_pred = run_SGDClassifier(
    train_x_data=data_train["commentaire"],
    y_train=data_train["note"],
    dev_x_data=data_dev["commentaire"],
    action="cvec",
    ngram=(2, 2),
)
accuracy_score(data_dev["note"], y_pred)

<h1>Prédictions avec lemmatisation. X = [commentaires]</h1>

In [None]:
# Read the pre-computed lemmatized texts and discard rows with missing values.
lemmatized = pd.read_csv("./data/processed/lemmatized.csv").dropna()

In [None]:
# TF-IDF features, unigrams and bigrams, trained on the lemmatized texts.
# NOTE(review): the features come from `lemmatized` and the labels from `data_train`,
# each filtered by its own dropna() — verify that both still have matching rows.
# NOTE(review): the dev comments are passed unlemmatized — confirm this is intended.
y_pred = run_SGDClassifier(
    train_x_data=lemmatized["lemmatized"],
    y_train=data_train["note"],
    dev_x_data=data_dev["commentaire"],
    action="tfidf",
    ngram=(1, 2),
)
accuracy_score(data_dev["note"], y_pred)