In [1]:
import pandas as pd
import re
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


In [2]:
df_train  = pd.read_csv('train.csv')
df_test  = pd.read_csv('test.csv')

In [3]:
print(f'Train shape: {df_train.shape}')
print(f'Test shape: {df_test.shape}')

Train shape: (31962, 3)
Test shape: (17197, 2)


In [4]:
df_train.head(5)

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [5]:
def  clean_text(df, text_field):
    
    df[text_field] = df[text_field].str.lower()
    
    df[text_field] = df[text_field].apply(lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", x))  
    return df

In [6]:
df_train_clean = clean_text(df_train, "tweet")
df_test_clean = clean_text(df_test, "tweet")

In [7]:
train_majority = df_train_clean[df_train_clean.label==0]
train_minority = df_train_clean[df_train_clean.label==1]

In [8]:
train_minority_upsampled = resample(train_minority, 
                                 replace=True,    
                                 n_samples=len(train_majority),   
                                 random_state=123)

In [9]:
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

1    29720
0    29720
Name: label, dtype: int64

In [10]:
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier()),])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'],
                                                    train_upsampled['label'],random_state = 0)

In [12]:
model = pipeline_sgd.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [13]:
f1_score(y_test, y_predict)

0.9693415089309518

In [18]:
def predict (tweet) :
    
    tweet = tweet.lower()
    tweet = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", tweet)  
    predcition = model.predict([tweet])
    if ((predcition[0]) == 1 ) : return('HATE SPEECH')
    else : return ('NATURAL SPEECH')
    

In [20]:
inputMessage = input('Enter a message :')
print('Your message :',inputMessage)
print('Output :', predict (inputMessage))

Your message : all pay meeting to tackle a movie which uncovers a problem we don't want the world to know.  
Output : NATURAL SPEECH
