In [72]:
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC

class Voting(object):
    """Label each TF-IDF term "Positive" or "Negative" by majority vote.

    Five linear models are fitted on the same TF-IDF matrix. Each model votes
    on every term according to the sign of that term's coefficient, and the
    most frequent label per term becomes the term's final vote.

    Attributes
    ----------
    vectorizer_ : TfidfVectorizer or None
        The vectorizer fitted by the most recent ``vectorize`` call.
    coefficients_ : dict or None
        Classifier name -> 1-D coefficient array from the most recent ``fit``.
    """

    def __init__(self):
        # Both attributes stay None until vectorize() / fit() populate them.
        self.vectorizer_ = None
        self.coefficients_ = None

    def vectorize(self, comments):
        """Fit a TF-IDF vectorizer on ``comments`` and return a dense frame.

        Parameters
        ----------
        comments : iterable of str
            Raw text documents, one per row of the result.

        Returns
        -------
        pandas.DataFrame
            One row per document and one column per retained uni-/bi-gram
            (at most 1000 terms, document frequency between 1% and 90%).
        """
        corpus = list(comments)
        vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=0.01, max_features=1000)
        vectors = vectorizer.fit_transform(corpus)
        self.vectorizer_ = vectorizer
        return pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())

    @staticmethod
    def _coef_vector(coef, n_features):
        """Return ``coef`` as a 1-D float array of length ``n_features``.

        Raises
        ------
        ValueError
            If ``coef`` cannot be reduced to exactly ``n_features`` values.
        """
        arr = np.asarray(coef, dtype=float)
        if arr.ndim == 2:
            # Linear classifiers (and PLSRegression on scikit-learn >= 1.3)
            # expose coef_ as (n_targets, n_features); older PLSRegression
            # releases used (n_features, n_targets). Pick the first target
            # along whichever axis is not the feature axis.
            arr = arr[0] if arr.shape[1] == n_features else arr[:, 0]
        if arr.ndim != 1 or arr.shape[0] != n_features:
            raise ValueError(
                f"expected {n_features} coefficients, got array of shape {np.shape(coef)}"
            )
        return arr

    @staticmethod
    def votes_from_coefficients(coefficients, feature_names):
        """Turn per-classifier coefficients into a vote table.

        Parameters
        ----------
        coefficients : dict
            Classifier name -> coefficients, either 1-D (one value per
            feature) or 2-D with a single target row/column.
        feature_names : sequence
            Feature labels; they become the index of the result.

        Returns
        -------
        pandas.DataFrame
            One column per classifier holding "Positive" (coefficient > 0) or
            "Negative" (otherwise), plus a 'Final Vote' column with the most
            frequent label of each row.

        Raises
        ------
        ValueError
            If ``coefficients`` is empty or an entry has the wrong length.
        """
        if not coefficients:
            raise ValueError("coefficients must contain at least one classifier")
        feature_names = pd.Index(feature_names)
        n_features = len(feature_names)
        votes = {
            name: np.where(Voting._coef_vector(coef, n_features) > 0, "Positive", "Negative")
            for name, coef in coefficients.items()
        }
        votes_df = pd.DataFrame(votes, index=feature_names, columns=list(coefficients))
        votes_df['Final Vote'] = votes_df.mode(axis=1)[0]
        return votes_df

    def fit(self, comments, score):
        """Fit the linear models and return the per-term vote table.

        Parameters
        ----------
        comments : iterable of str
            Raw text documents.
        score : array-like
            Target label per document (the calling cell passes a 0/1 column).

        Returns
        -------
        pandas.DataFrame
            Indexed by TF-IDF term, with one vote column per classifier and a
            'Final Vote' column (see ``votes_from_coefficients``).
        """
        features = self.vectorize(comments)
        # Only the training part of the 80/20 split is used for the
        # coefficients; the held-out rows are deliberately left untouched.
        X_train, _, y_train, _ = train_test_split(features, score, test_size=0.2, random_state=0)

        classifiers = {
            'Logistic Regression': LogisticRegression(random_state=0, max_iter=1000),
            'Linear Discriminant Analysis': LinearDiscriminantAnalysis(),
            # Seeded like the other models so repeated runs give the same signs.
            'SVM': LinearSVC(random_state=0),
            'PLS Regression': PLSRegression(n_components=3),
            'Ridge Classifier': RidgeClassifier()
        }

        n_features = features.shape[1]
        coefficients = {}
        for name, clf in classifiers.items():
            clf.fit(X_train, y_train)
            # Normalising here makes every entry 1-D regardless of the
            # estimator's (version-dependent) coef_ layout.
            coefficients[name] = self._coef_vector(clf.coef_, n_features)
        self.coefficients_ = coefficients

        votes_df = self.votes_from_coefficients(coefficients, features.columns)
        print("Votes based on coefficients:")
        return votes_df


In [47]:
# Load the semicolon-delimited netflix.csv into a DataFrame.
base = pd.read_csv('netflix.csv', sep=";")
# Binary target: 1 when the rating is strictly above 3, otherwise 0.
# A vectorised comparison replaces the per-row Python lambda.
base['note_binary'] = base['note'].gt(3).astype('int64')
base

Unnamed: 0,commentaire,note,note_eval,note_binary
0,Netflix ne considère pas ses clients Le servic...,1,1,0
1,Toujours pareil les mêmes films séries C'est u...,2,2,0
2,j'adore tout simplement. juste un peu cher à m...,4,4,1
3,J'avais le forfait standard avec une image de ...,1,1,0
4,"Des voleurs Tout simplement, des voleurs. Surv...",1,1,0
...,...,...,...,...
97,"On parle de Netflix et ses séries ""originales""...",5,5,1
98,SUITE ET FIN FLIXBIP et oui le respect des oeu...,1,3,0
99,certes c est bien on peut regarder des films o...,1,3,0
100,"Vraiment Netflix au TOP qualité, quantité, tre...",5,5,1


In [73]:
# Fit the voting model on the review text against the binary rating and
# display the resulting per-term vote table as the cell output.
model = Voting()
model.fit(comments=base['commentaire'], score=base['note_binary'])

Votes based on coefficients:


Unnamed: 0,Logistic Regression,Linear Discriminant Analysis,SVM,PLS Regression,Ridge Classifier,Final Vote
11,Positive,Positive,Positive,Positive,Positive,Positive
12,Negative,Negative,Negative,Negative,Negative,Negative
13,Negative,Negative,Negative,Negative,Negative,Negative
abandonne,Negative,Negative,Negative,Negative,Negative,Negative
abonnement,Positive,Negative,Positive,Positive,Positive,Positive
...,...,...,...,...,...,...
épisode,Positive,Positive,Positive,Positive,Positive,Positive
épisodes,Negative,Negative,Negative,Negative,Negative,Negative
était,Negative,Negative,Negative,Negative,Negative,Negative
été,Negative,Positive,Negative,Negative,Negative,Negative


In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.inspection import permutation_importance

# Number of highest-importance features kept per classifier.
TOP_K = 100

# Rebuild the TF-IDF matrix and the 80/20 split at notebook scope. The split
# made inside Voting.fit is local to that method, so X_train / X_test /
# y_train / y_test / feature_names would otherwise be undefined here.
features = Voting().vectorize(base['commentaire'])
feature_names = features.columns
X_train, X_test, y_train, y_test = train_test_split(
    features, base['note_binary'], test_size=0.2, random_state=0
)

# Every stochastic estimator is seeded so a re-run reproduces the ranking.
classifiers = {
    "RandomForest": RandomForestClassifier(random_state=0, max_depth=5),
    "AdaBoost": AdaBoostClassifier(random_state=0),
    "MLP": MLPClassifier(random_state=0),
    "MultinomialNB": MultinomialNB()
}

top_features = {}

for clf_name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    # Importance = mean score drop on the held-out rows when a column is shuffled.
    result = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=0)
    sorted_indices = np.argsort(result.importances_mean)[::-1]
    top_features[clf_name] = [feature_names[i] for i in sorted_indices[:TOP_K]]

# Features ranked in the top TOP_K by every classifier.
intersection_features = set.intersection(*map(set, top_features.values()))

# Sorted for a stable, readable display; the variable itself stays a set.
sorted(intersection_features)