# Question 1

In [346]:
import pandas as pd
import numpy as np

### Loading the data

In [347]:
df = pd.read_csv('spam.csv', encoding='latin1')
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
df.rename(columns={'v1' : 'Label', 'v2' : 'Message'}, inplace=True)
df['Label'] = df['Label'].apply(lambda x : 1 if (x == 'ham')  else 0)  # if 1 it is ham and if 0 it is spam
df

Unnamed: 0,Label,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will Ì_ b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


### preprocessing the text

In [348]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text) :
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = [stemmer.stem(word) for word in text.split() if word not in stop_words]  # Remove stop words and stem

    return text

messages = [clean_text(message) for message in df['Message'].values]

### word2vec model

In [349]:
from gensim.models import Word2Vec
w2v_model = Word2Vec(sentences= messages, vector_size=100, window=5, min_count=2, workers=4)

### sentence embeddings

In [356]:
def avg_sentence_embedding(tokens) :
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    return np.mean(vectors, axis = 0) if vectors else np.zeros(w2v_model.vector_size)

sentence_vectors = np.array([avg_sentence_embedding(message_tokens) for message_tokens in messages])
print(sentence_vectors.shape)
labels = df['Label'].to_numpy()
print(labels.shape)

(5572, 100)
(5572,)


### splitting the data 

In [352]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train, y_test = train_test_split(sentence_vectors, labels, test_size=0.2)
print(x_train.shape, x_test.shape)

(4457, 100) (1115, 100)


### logistic regression

In [353]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

classifier = LogisticRegression()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

accuracy = accuracy_score(y_pred, y_test)
print("Acurracy :", accuracy)

Acurracy : 0.8690582959641255


### function definition

In [355]:
from gensim.utils import simple_preprocess
dict1 = { 1 : 'ham', 0 : 'spam'}

def predict_message_class(model, w2v_model, message) :
    message_tokens = simple_preprocess(message)
    word_vectors = [w2v_model.wv[token] for token in message_tokens if token in w2v_model.wv]
    sentence_vector = np.mean(word_vectors, axis=0) if word_vectors else np.zeros(w2v_model.vector_size)
    val =  model.predict(sentence_vector.reshape(1,-1))
    return dict1[val[0]]


text = df['Message'][0]
label = df['Label'][0]
cls = predict_message_class(classifier, w2v_model, text)
print("message :", text)
print("actual label :", 'ham' if label == 1 else 'spam')
print("predicted label :", cls)

message : Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
actual label : ham
predicted label : ham


# Question 2

### loading data

In [337]:
import numpy as np
import pandas as pd

data = pd.read_csv('Tweets.csv')
data = data[['text', 'airline_sentiment']]
data

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
...,...,...
14635,@AmericanAir thank you we got on a different f...,positive
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative
14637,@AmericanAir Please bring American Airlines to...,neutral
14638,"@AmericanAir you have my money, you change my ...",negative


In [338]:
text = data['text']
print(len(text))
print(text[:5])

14640
0                  @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials t...
2    @VirginAmerica I didn't today... Must mean I n...
3    @VirginAmerica it's really aggressive to blast...
4    @VirginAmerica and it's a really big bad thing...
Name: text, dtype: object


### encoding the label

In [339]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
label = data['airline_sentiment']
print(label)
label = encoder.fit_transform(label)     #  0 : negative, 1 : neutral, 2 : positive
label = np.array(label)
print(label)
print(label.shape)

0         neutral
1        positive
2         neutral
3        negative
4        negative
           ...   
14635    positive
14636    negative
14637     neutral
14638    negative
14639     neutral
Name: airline_sentiment, Length: 14640, dtype: object
[1 2 1 ... 1 0 1]
(14640,)


### preprocessing the text

In [340]:
import re
import contractions
import emoji
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def pre_processing(text) :
    text = text.lower() 
    text = contractions.fix(text)   # expand the contractions
    text = re.sub(r'https?:\/\/\S+|www\.\S+', '', text)   # to remove urls 
    text = re.sub(r'@\w+', '', text)   # to remove mentions
    text = re.sub(r'#', '', text)   # to remove hashtags keep the word
    text = emoji.demojize(text)  # Convert emojis to text
    text = re.sub(r'\s+', ' ', text)   # to remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)   # to remove special characters and punctuation

    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]  # lemmatize and remove stop words

    return text

text_final = text.apply(lambda x : pre_processing(x))
print(text_final[:5])

0                                               [said]
1         [plus, added, commercial, experience, tacky]
2       [today, must, mean, need, take, another, trip]
3    [really, aggressive, blast, obnoxious, enterta...
4                            [really, big, bad, thing]
Name: text, dtype: object


### loading the model

In [None]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [303]:
glove_model = api.load('glove-twitter-100')



### sentence embeddings

In [342]:
def sentence_embeddings(text_tokens, wv) :
    word_vectors = [wv[token] for token in text_tokens if token in wv]
    return np.mean(word_vectors, axis= 0 ) if word_vectors else np.zeros(wv.vector_size)

sentence_vectors_word2vec = np.array([sentence_embeddings(text_tokens, wv) for text_tokens in text_final])
print(sentence_vectors_word2vec.shape)

sentence_vectors_glove = np.array([sentence_embeddings(text_tokens, glove_model) for text_tokens in text_final])
print(sentence_vectors_glove.shape)

(14640, 300)
(14640, 100)


### logistic regression

#### using google news word2vec model

In [343]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
x_train_1, x_test_1, y_train, y_test = train_test_split(sentence_vectors_word2vec, label, test_size=0.2)
classifier1 = LogisticRegression()
classifier1.fit(x_train_1, y_train)
y_pred_1 = classifier1.predict(x_test_1)

accuracy1 = accuracy_score(y_pred_1, y_test)
print('Accuracy with word2vec:', accuracy1)

Accuracy with word2vec: 0.7650273224043715


#### using glove twitter model

In [344]:
x_train_2, x_test_2, y_train, y_test = train_test_split(sentence_vectors_glove, label, test_size=0.2)
classifier2 = LogisticRegression()
classifier2.fit(x_train_2, y_train)
y_pred_2 = classifier2.predict(x_test_2)

accuracy2 = accuracy_score(y_pred_2, y_test)
print('Accuracy with glove :', accuracy2)

Accuracy with glove : 0.7595628415300546


### function definition

In [357]:
dict2 = {0 : 'negative', 1 : 'neutral', 2 : 'positive'}

def predict_tweet_sentiment(model, glove_model, tweet) :
    tweet_process = pre_processing(tweet)
    sentence_vector = sentence_embeddings(tweet_process, glove_model)
    val =  model.predict(sentence_vector.reshape(1,-1))
    return dict2[val[0]]


text = data['text'][5]
print("tweet :", text)
print("actual label :" , data['airline_sentiment'][5])
print("predicted label using word2vec:", predict_tweet_sentiment(classifier1, wv, text))
print("predicted label using glove :", predict_tweet_sentiment(classifier2, glove_model, text))


tweet : @VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.
it's really the only bad thing about flying VA
actual label : negative
predicted label using word2vec: negative
predicted label using glove : negative
