In [34]:
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

load twitter dataset

In [32]:
data = pd.read_csv('dataset/train.csv')
data.info()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
train, test = train_test_split(data, test_size=0.2, stratify=data['label'], random_state=21)

train.shape, test.shape

((25569, 3), (6393, 3))

build  tf-idf model and fit train dataset

In [7]:
tfidf_vectorizer = TfidfVectorizer(lowercase=True, max_features=1000, stop_words=ENGLISH_STOP_WORDS)
tfidf_vectorizer.fit(train.tweet)


TfidfVectorizer(max_features=1000,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

transform the train and test data tweets

In [14]:
train_idf = tfidf_vectorizer.transform(train.tweet) # X values
test_idf = tfidf_vectorizer.transform(test.tweet)

fit model with train data

In [15]:
model = LogisticRegression()
model.fit(train_idf, train.label)

LogisticRegression()

use the predict() function that uses the trained model to generate the predictions


In [19]:
# predict model on training data
predict_train = model.predict(train_idf)
# predict model on test data
predict_test = model.predict(test_idf)
# f1 score on train data
f1_score(y_true= train.label, y_pred=predict_train)
# f1 score on test data
f1_score(y_true=test.label, y_pred=predict_test)

0.45751633986928114

In [39]:
# define stages of sci-kit learn pipelines
pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer(lowercase=True, max_features=1000, stop_words=ENGLISH_STOP_WORDS)), ('model', LogisticRegression())])
# fit pipeline model with the training data
pipeline.fit(train.tweet, train.label)

0        0
1        0
2        0
3        0
4        0
        ..
31957    0
31958    0
31959    0
31960    1
31961    0
Name: label, Length: 31962, dtype: int64

In [26]:
# test pipeline with a sample tweet
text = ['the weather is pretty bad i do not want to go out']

# predict the label using pipeline
pipeline.predict(text)

array([0])

In [27]:
from joblib import dump
# dump the pipeline model to a new file
dump(pipeline, filename='text_classification.joblib')

['text_classification.joblib']