In [None]:
import pickle

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the two review splits. Both carry the `note` column that later cells use
# as the classification target; `dev` is the split the models are scored on.
train, dev = (
    pd.read_csv(split_path)
    for split_path in ("./data/final/train.csv", "./data/final/dev.csv")
)

In [None]:
# External Allocine movie table; its `id` column is matched against `movie`
# in the review splits.
allocine_infos = pd.read_csv("./data/externe/allocine_movies.csv", sep=",")

# Columns removed right after the join: the duplicated join key and the
# press / spectator rating columns.
dropped_columns = [
    "id",
    "press_rating",
    "number_of_press_rating",
    "spec_rating",
    "number_of_spec_rating",
]

# The left join keeps every review row; dropna() then discards any row that
# still has a missing value in ANY remaining column.
temp = train.merge(right=allocine_infos, how="left", left_on="movie", right_on="id")
temp_train = temp.drop(columns=dropped_columns).dropna()

# Same treatment for the dev split. Because of dropna(), the accuracies printed
# further down are computed only on dev rows that have complete data.
temp_dev = (
    dev.merge(right=allocine_infos, how="left", left_on="movie", right_on="id")
    .drop(columns=dropped_columns)
    .dropna()
)

In [None]:
# Derive `release_year` and `release_month` from `release_date` on both splits.
# The values are the raw string pieces found between '-' separators (first and
# second piece respectively); no numeric conversion happens here.
for reviews in (temp_train, temp_dev):
    date_parts = reviews["release_date"].apply(lambda date: date.split("-"))
    reviews["release_year"] = date_parts.apply(lambda parts: parts[0])
    reviews["release_month"] = date_parts.apply(lambda parts: parts[1])

In [None]:
# Case-sensitive TF-IDF over word uni- and bi-grams.
# Currently referenced only by the disabled transformers listed further down.
tfidf = TfidfVectorizer(analyzer="word", lowercase=False, ngram_range=(1, 2))

# Lower-cased TF-IDF over word uni- and bi-grams, using NLTK's French stop-word list.
# Currently referenced only by the disabled "summary" transformer further down.
tfidf2 = TfidfVectorizer(
    analyzer="word",
    lowercase=True,
    ngram_range=(1, 2),
    stop_words=stopwords.words("french"),
)

# Case-sensitive raw counts over word 1- to 3-grams.
cvec = CountVectorizer(analyzer="word", lowercase=False, ngram_range=(1, 3))

# Feature extraction applied to the merged review frame. Two transformers are
# active: n-gram counts of `commentaire_clean` and a min-max scaled
# `sentimental_score`.
preprocessor = ColumnTransformer(
    transformers=[
        # NOTE: registered under the name "tfidf1" although the estimator is the
        # CountVectorizer defined above; the name is kept because it is the key
        # used to address this step in the pipeline's parameters.
        ("tfidf1", cvec, "commentaire_clean"),
        # Disabled feature experiments, kept for reference:
        # ('tfidf2', tfidf2, "summary"),
        # ('countvect1', tfidf, "title"),
        # ('countvect2', tfidf, "genres"),
        # ('countvect3', tfidf, "actors"),
        # ("countvect4", tfidf, "directors"),
        # ("scaler2", StandardScaler(), ["duration"]),
        # ("scaler21", StandardScaler(), ["release_year"]),
        # ("scaler22", StandardScaler(), ["release_month"]),
        # ("scaler3", OneHotEncoder(handle_unknown = 'ignore'), ["nationality"]),
        ("scaler4", MinMaxScaler(), ["sentimental_score"]),
    ]
)

In [None]:
# Linear classifier with hinge loss and L2 penalty, trained by stochastic
# gradient descent on the output of `preprocessor`.
# SGD shuffles the training data between epochs, so the seed is pinned:
# without it the fitted model (and the dev accuracy printed below) changes on
# every Restart & Run All.
model = make_pipeline(
    preprocessor,
    SGDClassifier(loss="hinge", penalty="l2", random_state=42),
)
# The whole merged frame is passed as X; the target is the `note` column cast
# to strings, so predictions come back as string labels as well.
model.fit(temp_train, temp_train["note"].astype("string"))

In [None]:
# Score the SGD pipeline on the dev split; labels are cast to strings to match
# the string labels the model was trained on.
y_pred = model.predict(temp_dev)
print(
    accuracy_score(
        y_true=temp_dev["note"].astype("string"),
        y_pred=y_pred,
    )
)

In [None]:
# Give the "scaler4" (sentimental_score) features twice the weight of the
# n-gram counts. Assigning an ad-hoc `.weight` attribute on the MinMaxScaler has
# no effect on the transform; ColumnTransformer's `transformer_weights` is the
# supported way to multiply a transformer's output.
# The preprocessor is cloned so this pipeline gets its own unfitted copy:
# sharing the instance with `model` would refit (and re-weight) the
# preprocessing step of the already trained SGD pipeline.
weighted_preprocessor = clone(preprocessor).set_params(
    transformer_weights={"scaler4": 2}
)

# The 'sag' solver shuffles the data, so the seed is pinned for reproducible fits.
logistic_regression = LogisticRegression(
    penalty="l2",
    C=0.1,
    solver="sag",
    max_iter=500,
    random_state=42,
)
model2 = make_pipeline(weighted_preprocessor, logistic_regression)
# Same inputs and string-cast target as the SGD pipeline.
model2.fit(temp_train, temp_train["note"].astype("string"))

In [None]:
# Score the logistic-regression pipeline on the dev split, comparing against
# the string-cast `note` labels.
y_pred = model2.predict(temp_dev)
print(
    accuracy_score(
        y_true=temp_dev["note"].astype("string"),
        y_pred=y_pred,
    )
)

In [None]:
# Save Models
# Each fitted pipeline is pickled to the current working directory. The context
# managers guarantee the files are flushed and closed before anything reads
# them back, instead of relying on garbage collection of an anonymous handle.
with open("SGDClassifier_com1-3_sentiscore.pickle", "wb") as sgd_file:
    pickle.dump(model, sgd_file)
with open("LogisticRegression_com1-3_sentiscore.pickle", "wb") as logreg_file:
    pickle.dump(model2, logreg_file)

In [None]:
# Load and test Model
# Round-trip check: reload both pickled pipelines and re-score them on the dev split.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
# Naming caveat: `test_model` is the LogisticRegression pipeline and
# `test_model2` the SGD one, so the accuracies print in the reverse order of
# the cells above.
with open("LogisticRegression_com1-3_sentiscore.pickle", "rb") as logreg_file:
    test_model = pickle.load(logreg_file)
with open("SGDClassifier_com1-3_sentiscore.pickle", "rb") as sgd_file:
    test_model2 = pickle.load(sgd_file)

y_pred = test_model.predict(temp_dev)
print(accuracy_score(temp_dev["note"].astype("string"), y_pred))
y_pred = test_model2.predict(temp_dev)
print(accuracy_score(temp_dev["note"].astype("string"), y_pred))