In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.svm import SVC
from sklearn import datasets
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from stop_words import get_stop_words
from joblib import dump, load
from sklearn.model_selection import train_test_split


# Regex pour garder uniquement les chiffre et les lettres

In [2]:
df = pd.read_csv("labels.csv", usecols=['class', 'tweet'])

In [3]:
df.head()

Unnamed: 0,class,tweet
0,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


# Cleannage Data

In [4]:
df['tweet'] = df['tweet'].apply(lambda tweet: re.sub('[^A-Za-z\']+', ' ',tweet))

In [6]:
df

Unnamed: 0,class,tweet
0,2,RT mayasolovely As a woman you shouldn't comp...
1,1,RT mleew boy dats cold tyga dwn bad for cuffi...
2,1,RT UrKindOfBrand Dawg RT sbaby life You ever ...
3,1,RT C G Anderson viva based she look like a tr...
4,1,RT ShenikaRoberts The shit you hear about me ...
...,...,...
24778,1,you's a muthaf in lie LifeAsKing Pearls corey ...
24779,2,you've gone and broke the wrong heart baby and...
24780,1,young buck wanna eat dat nigguh like I aint fu...
24781,1,youu got wild bitches tellin you lies


In [7]:
df.tweet.describe()

count                       24783
unique                      24764
top       These hoes ain't loyal 
freq                            4
Name: tweet, dtype: object

# Model Persistence

In [8]:
clf = make_pipeline(
    TfidfVectorizer(stop_words=get_stop_words('en')),
    OneVsRestClassifier(SVC(kernel='linear', probability=True))
)

In [9]:
X = df['tweet']
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.80, random_state=42)

In [10]:
clf.fit(X=X_train,y=y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(stop_words=['a', 'about', 'above', 'after',
                                             'again', 'against', 'all', 'am',
                                             'an', 'and', 'any', 'are',
                                             "aren't", 'as', 'at', 'be',
                                             'because', 'been', 'before',
                                             'being', 'below', 'between',
                                             'both', 'but', 'by', "can't",
                                             'cannot', 'could', "couldn't",
                                             'did', ...])),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [11]:
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
print(df['tweet'][0:10][0])
clf2.predict(X[0:10])

 RT mayasolovely As a woman you shouldn't complain about cleaning up your house amp as a man you should always take the trash out 


array([2, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [22]:
dump(clf2,"algo_classes.joblib")

['algo_classes.joblib']