In [None]:
# importing all necessary libraries to run the code
import re,string
import numpy as np
import pandas as pd
import keras_metrics
import tensorflow.keras
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation,Embedding
# English stop-word list from the NLTK stopwords corpus, kept in `sw`.
# The later symbol/number clean-up cell re-creates `sw` the same way before
# filtering tokens against it.
sw = stopwords.words('english')

In [None]:
# Load the labelled tweets; every later cell reads from `data`.
TRAIN_CSV = 'train.csv'
data = pd.read_csv(TRAIN_CSV)

In [None]:
# Class balance: number of rows per value of the `label` column.
data['label'].value_counts()

In [None]:
# Count of missing values per column, then a preview of the first two rows.
missing_counts = data.isnull().sum()
print(missing_counts)
data.head(2)

In [None]:
# Token-level clean-up of the raw tweets: drop English stop words and every
# token of two characters or fewer, then re-join the survivors with spaces.
wordsEng = stopwords.words('english')
# Set lookup is O(1) per token; testing against the list scans it every time.
_stopword_lookup = frozenset(wordsEng)


def _drop_stopwords_and_short_tokens(tweet):
    """Return `tweet` without stop words and without tokens shorter than 3 chars.

    The stop-word comparison is case-sensitive, exactly as before: a
    capitalised token such as "The" is kept by this step.
    """
    return " ".join(
        token
        for token in tweet.split()
        if token not in _stopword_lookup and len(token) > 2
    )


# One pass over the column instead of four separate .apply() traversals.
data['tweet'] = data['tweet'].apply(_drop_stopwords_and_short_tokens)
data.head(2)

In [None]:
# Strip URLs, punctuation and @mention / #hashtag tokens from every tweet.
# testList collects the cleaned tweet strings in the same order as the rows of `data`.
testList=[]
# Compiled once at definition time. The raw string spells the same pattern as
# before without relying on invalid string escapes such as '\w' or '\+'.
# Note that the character class contains the range '+'..'=' and therefore also
# accepts ',', '<' and ';' as part of a link.
_LINK_REGEX = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)', re.DOTALL)


def strip_links(text):
    """Replace every http(s) link found in `text` with ``', '``.

    Each regex match is substituted in place. The previous implementation
    collected the matches and then called ``str.replace`` for each one, which
    damaged a later link whenever an earlier match was a prefix of it
    (e.g. ``http://x.co`` followed by ``http://x.co/y`` left ``/y`` behind).

    Parameters
    ----------
    text : str
        Tweet text to clean.

    Returns
    -------
    str
        `text` with each matched link replaced by ``', '``.
    """
    return _LINK_REGEX.sub(', ', text)

def strip_all_entities(text):
    """Blank out punctuation and drop tokens that begin with '@' or '#'.

    Every character of ``string.punctuation`` other than '@' and '#' is turned
    into a space. The text is then split on whitespace, tokens whose first
    character is '@' or '#' are discarded, and the remaining tokens are joined
    with single spaces.
    """
    entity_prefixes = ('@', '#')
    punctuation_to_space = {
        ord(symbol): ' '
        for symbol in string.punctuation
        if symbol not in entity_prefixes
    }
    spaced_text = text.translate(punctuation_to_space)
    kept_tokens = [
        token
        for token in spaced_text.split()
        if not token.startswith(entity_prefixes)
    ]
    return ' '.join(kept_tokens)


# Clean every tweet: links are removed first, then punctuation and
# @mention / #hashtag tokens. Iterating over the column values replaces the
# per-row label lookup data['tweet'][t], which is slow and only works while
# the DataFrame index is exactly 0..n-1.
testList.extend(strip_all_entities(strip_links(tweet)) for tweet in data['tweet'])

In [None]:
# Put the cleaned tweet strings from testList into a one-column DataFrame
# (column label 0) so the following cells can address them as dat[0].
cleaned_tweet_array = np.array(testList)
dat = pd.DataFrame(cleaned_tweet_array)
dat.head(5)

In [None]:
# Final normalisation of each cleaned tweet for the Keras tokenizer:
# replace disallowed symbols with spaces, delete digit runs, lower-case,
# drop English stop words, then collapse whatever non-alphanumeric residue
# is left. Results are collected in twitterSentiment, one string per row of `dat`.
sw = stopwords.words('english')
# Set lookup is O(1) per token; `sw` itself stays a list as before.
_sw_lookup = frozenset(sw)

# Patterns are compiled once, outside the loop. Raw strings spell the same
# regexes without relying on invalid string escapes such as '\w' or '\S'.
_bad_symbol_re = re.compile('[^0-9a-zA-Z #+_♥️]')  # Remove bad symbols
_digit_run_re = re.compile(r'\d+')
_residue_re = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

twitterSentiment=[]
for raw_tweet in dat[0]:
    # Two earlier substitutions ('[^a-zA-Z]' and '[/(){}\[\]\|@!,;]') used to
    # run here, but each one re-read the raw text and its result was
    # overwritten by the next line, so they never affected the output.
    # They are removed; the produced text is unchanged.
    review = _bad_symbol_re.sub(' ', raw_tweet)

    review = _digit_run_re.sub('', review)
    review = review.lower().split()

    review = [token for token in review if token not in _sw_lookup]
    review = ' '.join(review)
    # split()/join collapses the runs of spaces left behind by the substitution.
    review = ' '.join(_residue_re.sub(" ", review).split())
    twitterSentiment.append(review)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
count_vectorizer = TfidfVectorizer(ngram_range=(1,1))
X_train = count_vectorizer.fit_transform(X_train)
X_test = count_vectorizer.transform(X_test)

In [None]:
from sklearn.model_selection import train_test_split
#from sklearn import preprocessing
#lab_enc = preprocessing.LabelEncoder()
#lab_enc.fit(data['gender'])
#data['gender']=lab_enc.transform(data['gender'])
X_train, X_test, Y_train, Y_test = train_test_split(dat[0],data.label, test_size = 0.3, random_state = 42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report
# Logistic-regression baseline on the TF-IDF features.
# NOTE: despite its name, `sgd` holds the logistic-regression test predictions.
lr = LogisticRegression()
lr.fit(X_train, Y_train)
sgd = lr.predict(X_test)
print(accuracy_score(Y_test, sgd))
print(classification_report(Y_test, sgd))

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report
# Linear-kernel SVM on the TF-IDF features.
print("SVC")
svm = SVC(kernel='linear', C=2.0, random_state=500)
svm.fit(X_train, Y_train)
predictionSVM = svm.predict(X_test)
print(accuracy_score(Y_test, predictionSVM))
print(classification_report(Y_test, predictionSVM))

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier

# SGD linear classifier wrapped in 5-fold isotonic probability calibration.
# SGDClassifier shuffles the training data, so it is seeded here; without a
# random_state the reported metrics change from run to run.
clf = SGDClassifier(max_iter=1100, tol=1e-3, random_state=42)
calibrated_clf = CalibratedClassifierCV(clf, cv=5, method='isotonic')
calibrated_clf.fit(X_train, Y_train)
pred = calibrated_clf.predict(X_test)
print(accuracy_score(Y_test,pred))
print(classification_report(Y_test,pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 depth-2 trees on the TF-IDF features.
rfc = RandomForestClassifier(n_estimators=100, random_state=2, max_depth=2)
rfc.fit(X_train, Y_train)
predictionRF = rfc.predict(X_test)
print(accuracy_score(Y_test, predictionRF))
print(classification_report(Y_test, predictionRF))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
# Hard-voting ensemble of logistic regression and an SGD linear classifier.
# The SGD member is seeded so the ensemble's metrics are reproducible; it was
# previously unseeded and gave different results on every run.
clf2 = SGDClassifier(max_iter=1100, tol=1e-3, random_state=42)
clf1 = LogisticRegression()
eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('sgd', clf2)],voting='hard')
eclf1.fit(X_train, Y_train)
predictionVC = eclf1.predict(X_test)
print(accuracy_score(Y_test,predictionVC))
print(classification_report(Y_test,predictionVC))

In [None]:
# Wrap the normalised tweets from twitterSentiment in a one-column DataFrame
# (column label 0); the Keras tokenizer cell reads dataSetFinal[0].
normalised_tweet_array = np.array(twitterSentiment)
dataSetFinal = pd.DataFrame(normalised_tweet_array)
dataSetFinal.head(2)

In [None]:
# Fit a Keras tokenizer on the normalised tweets, turn each tweet into a list
# of integer word ids, and pad the sequences into the array X.
tokenizer = tensorflow.keras.preprocessing.text.Tokenizer(
    num_words=1500,
    lower=True,
    split=' ',
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
normalised_texts = dataSetFinal[0].values
tokenizer.fit_on_texts(normalised_texts)
#print(tokenizer.word_index)  # To see the dictionary
X = tokenizer.texts_to_sequences(normalised_texts)
X = tensorflow.keras.preprocessing.sequence.pad_sequences(X)

In [None]:
from keras import optimizers
#Deep Learning Network Structure:
# Embedding -> Dropout -> Conv1D -> MaxPooling1D -> LSTM -> 2-unit softmax.
model_conv = Sequential()
# 1500 is the same vocabulary cap passed to the tokenizer as num_words.
model_conv.add(Embedding(1500,100, input_length=X.shape[1]))
model_conv.add(Dropout(0.5))
model_conv.add(Conv1D(64, 5, activation='relu'))
model_conv.add(MaxPooling1D(pool_size=4))
model_conv.add(LSTM(100))
model_conv.add(Dense(2, activation='softmax'))
sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
# Pass the configured optimizer object itself. The string 'sgd' made Keras
# build a fresh default SGD, so the momentum / Nesterov / decay settings above
# were never used.
# The evaluation cell indexes its results by the order of this metrics list.
model_conv.compile(
    loss='binary_crossentropy',
    optimizer=sgd,
    metrics=['accuracy','mae',keras_metrics.precision(), keras_metrics.recall()],
)

In [None]:
batch_size=64
# One-hot encode the labels: one column per distinct label value.
Y = pd.get_dummies(data['label']).values
# Seeded 70/30 split so the validation set is the same on every run (it was
# previously unseeded). Weight initialisation is still unseeded, so training
# results can still vary between runs.
# NOTE: this rebinds X_train / Y_train, which earlier held the TF-IDF split.
X_train, X_valid, Y_train, Y_valid = train_test_split(X,Y, test_size = 0.30, random_state = 42)
#Here we train the Network.
# NOTE: `pred` receives the return value of fit(), not predictions.
pred=model_conv.fit(X_train, Y_train, batch_size =batch_size, epochs =10, verbose =2,validation_data=(X_valid,Y_valid))
pred

In [None]:
# Evaluate on the validation split and print selected entries of the result.
# Entries are picked by position: 0 is printed as "score", 1 as validation
# accuracy, 4 as recall and 3 as precision.
score = model_conv.evaluate(X_valid, Y_valid, verbose=2, batch_size=batch_size)
#keras.metrics.binary_accuracy(Y_valid,pred)
for line_template, position in (
    ("score: %.2f", 0),
    ("validation accuracy: %.2f", 1),
    ("recall: %.2f", 4),
    ("Precision: %.2f", 3),
):
    print(line_template % (score[position]))