In [1]:
from joblib import dump
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [3]:
# Read the labelled tweets from a CSV in the working directory.
data = pd.read_csv('twitter_sentiments.csv')
# Preview the first five rows to check the columns.
data.head(n=5)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(31962, 3)

In [6]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes the
# split repeatable.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data['label'],
    random_state=21,
)
# Show the resulting sizes of the two splits.
train.shape, test.shape

((25569, 3), (6393, 3))

In [7]:
# TF-IDF features over lower-cased tweets, capped at the 1000 most frequent
# terms, with English stop words removed.
# stop_words='english' selects scikit-learn's built-in English list (the same
# words as ENGLISH_STOP_WORDS). Releases of scikit-learn that validate
# constructor parameters accept only 'english', a list, or None here and
# reject a frozenset, so the string form keeps this cell runnable across
# versions.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, max_features=1000, stop_words='english')
# Learn the vocabulary and IDF weights from the training tweets only, so that
# no statistics from the held-out split leak into the features.
tfidf_vectorizer.fit(train.tweet)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=frozenset({'before', 'because', 'beforehand', 'during', 'four', 'has', 'yourself', 'once', 'besides', 'former', 'into', 'first', 'fire', 'otherwise', 'done', 'rather', 'serious', 'being', 'ever', 'becoming', 'everything', 'onto', 'noone', 'take', 'all', 'a', 'seem', 'been', 'whose', 'keep..., 'whenever', 'from', 'they', 'am', 'though', 'their', 'further', 'any', 'even', 'nobody', 'every'}),
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [8]:
# Project both splits into the TF-IDF space learned from the training tweets.
train_idf = tfidf_vectorizer.transform(train['tweet'])
test_idf = tfidf_vectorizer.transform(test['tweet'])

In [9]:
# Logistic regression with scikit-learn's default hyper-parameters.
model_LR = LogisticRegression()

# Fit on the TF-IDF training matrix against the tweet labels; the fitted
# estimator is the cell's displayed value.
model_LR.fit(train_idf, train['label'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Predicted labels for the training matrix, kept to score the fit on seen data.
predict_train = model_LR.predict(train_idf)

In [11]:
# Predicted labels for the held-out matrix, kept to score generalisation.
predict_test = model_LR.predict(test_idf)

In [12]:
# F1 on the training split (f1_score's default binary averaging).
f1_score(train['label'], predict_train)

0.48821414302836597

In [13]:
# F1 on the held-out split (f1_score's default binary averaging).
f1_score(test['label'], predict_test)

0.45751633986928114

In [14]:
# Bundle vectorisation and classification into a single estimator so raw tweet
# text can be passed directly to fit/predict and the whole thing can be
# persisted as one object.
# stop_words='english' selects scikit-learn's built-in English list (the same
# words as ENGLISH_STOP_WORDS). Releases of scikit-learn that validate
# constructor parameters accept only 'english', a list, or None here and
# reject a frozenset.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(lowercase=True, max_features=1000, stop_words='english')),
    ('model', LogisticRegression()),
])

In [15]:
# Fit the whole pipeline on the raw training tweets and their labels.
pipeline.fit(train.tweet,train.label)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [17]:
# Smoke-test the fitted pipeline on one raw, unseen headline, passed as a
# single-element list of documents.
text = ["Virat Kohli,AB de Villiers set to auction their 'Green Day' kits from 2016 IPL match to raise funds"]
pipeline.predict(text)

array([0], dtype=int64)

In [19]:
# Persist the fitted pipeline (vectoriser + classifier) to a joblib file in the
# working directory. `dump` is imported in the notebook's top import cell so
# that all dependencies are declared in one place.
# NOTE(review): joblib files are pickle-based; only load this artefact from a
# trusted source.
dump(pipeline, filename="text_classification.joblib")

['text_classification.joblib']