In [None]:
import re, nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.svm import LinearSVC
import joblib


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the poem dataset (columns shown in the output below: Genre, Poem) into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/Machine Lerning/CA2_Task_2/Poem_Data.csv')

# Lift pandas' display limits so printed frames show full cell contents and every column
for display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, None)

print(df)
# Cleaning Tweets
# Lazily filled cache shared by every cleaner() call (stop-word set + lemmatizer).
_CLEANER_RESOURCES = {}


def _get_cleaner_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not _CLEANER_RESOURCES:
        _CLEANER_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEANER_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEANER_RESOURCES['stop_words'], _CLEANER_RESOURCES['lemmatizer']


def cleaner(tweet):
    """Turn one raw text into a list of lowercase, verb-lemmatized tokens.

    Steps, in order: strip HTML markup/entities, remove @mentions, URLs and
    literal backslash-x escapes, replace every run of non-letters with a space,
    tokenize, lowercase, drop English stop words, lemmatize each token as a verb.

    Parameters
    ----------
    tweet : str
        Raw text to clean. Any non-string value (e.g. the float NaN pandas uses
        for a missing cell) is treated as empty text.

    Returns
    -------
    list of str
        The cleaned tokens; empty when nothing survives cleaning.
    """
    if not isinstance(tweet, str):
        return []

    # removing HTML entities such as '&amp', '&quot', '&gt'; lxml is the html parser
    # and should be installed using 'pip install lxml'
    text = BeautifulSoup(tweet, 'lxml').get_text()
    text = re.sub(r"(@|http://|https://|www|\\x)\S*", " ", text)  # @mentions, urls, etc. -> whitespace
    text = re.sub("[^A-Za-z]+", " ", text)  # any run of non-alphabetic characters -> whitespace

    # The stop-word list is read from the NLTK corpus; fetch it (and the lemmatizer)
    # from the cache instead of rebuilding both for every row.
    stop_words, lemmatizer = _get_cleaner_resources()

    lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))
    return [lemmatizer.lemmatize(token, 'v') for token in lowered_tokens if token not in stop_words]


           Genre  \
0          Music   
1          Music   
2          Music   
3          Music   
4          Music   
..           ...   
836  Environment   
837  Environment   
838  Environment   
839  Environment   
840  Environment   

                                                                                                                                                                                                                                                                                                                                                            Poem  
0                                                                                                                                                                                                                                                                                                                                                            NaN  
1                                                

In [None]:
# Missing poems are loaded as float NaN. astype(str) alone would turn them into the
# literal text 'nan', which survives cleaning as the token "nan" and stays in the
# training data, so replace NaN with an empty string first.
df['Poem'] = df['Poem'].fillna('').astype(str)
df['cleaned_tweet'] = df['Poem'].apply(cleaner)

# Removing rows with cleaned tweets of length 0. reset_index yields a fresh frame
# labelled 0..n-1, so the positional indices produced later by the cross-validation
# splitter line up with the labels of Y, and the column edits below act on an
# independent frame rather than on a slice of the original.
df = df[df['cleaned_tweet'].map(len) > 0].reset_index(drop=True)

print("Printing top 5 rows of dataframe showing original and cleaned tweets....")

print(df[['Poem','cleaned_tweet']].head())

df = df.drop(columns=['Poem'])

# Saving cleaned tweets to csv (the token lists are written as their string representation)
df.to_csv('cleaned_data.csv', index=False)

# joining tokens to create strings. TfidfVectorizer does not accept tokens as input
df['cleaned_tweet'] = df['cleaned_tweet'].map(" ".join)

data = df['cleaned_tweet']

Y = df['Genre'] # target column

# A float min_df is a proportion of documents: an ngram (unigram, bigram, & trigram) is
# kept only if it occurs in at least min_df * n_documents documents. With the roughly
# 840 poems loaded here, .00015 puts that threshold below one document, so no ngram is
# actually filtered out. Raise min_df (or pass an integer document count) to prune rare ngrams.
tfidf = TfidfVectorizer(min_df=.00015, ngram_range=(1,3))

# NOTE(review): the vocabulary and IDF weights are learned from ALL rows, including the
# ones later held out as cross-validation test folds, so the CV accuracy computed from
# data_tfidf is slightly optimistic. Fitting the vectorizer inside each fold avoids this.
data_tfidf = tfidf.fit_transform(data) # learn vocabulary of entire data and create tfidf values

pd.DataFrame(pd.Series(tfidf.get_feature_names_out())).to_csv('vocabulary.csv', header=False, index=False)

print("Shape of tfidf matrix: ", data_tfidf.shape)

  soup = BeautifulSoup(tweet, 'lxml') # removing HTML entities such as ‘&amp’,’&quot’,'&gt'; lxml is the html parser and shoulp be installed using 'pip install lxml'


Printing top 5 rows of dataframe showing original and cleaned tweets....
                                                                                                                                                                                                                                                 Poem  \
0                                                                                                                                                                                                                                                 nan   
1                                                     In the thick brushthey spend the hottest part of the day,              soaking their hoovesin the trickle of mountain water              the ravine hoardson behalf of the oleander.              
2                                                        Storms are generous.                                      Something so easy to surrender to, sitting by the window, and the

In [None]:
# Implementing Support Vector Classifier
# Linear kernel with the default C = 1. random_state is fixed (same seed as the
# cross-validation splitter) so repeated runs of the notebook give the same scores.
model = LinearSVC(random_state=1)


In [None]:
# Running cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1) # 10-fold cross-validation
scores = []  # accuracy of each fold, in fold order
for iteration, (train_index, test_index) in enumerate(kf.split(data_tfidf, Y), start=1):
    print("Iteration ", iteration)
    X_train, X_test = data_tfidf[train_index], data_tfidf[test_index]
    # kf.split yields POSITIONS, so select labels with .iloc. Plain Y[...] on an
    # integer-indexed Series looks the numbers up as index LABELS, which raises
    # KeyError (or picks the wrong rows) once any row has been dropped from the frame.
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]
    model.fit(X_train, Y_train) # Fitting SVC (fit() retrains from scratch on each fold)
    Y_pred = model.predict(X_test)
    score = metrics.accuracy_score(Y_test, Y_pred) # Calculating accuracy
    print("Cross-validation accuracy: ", score)
    scores.append(score) # appending cross-validation accuracy for each iteration
mean_accuracy = np.mean(scores)
print("Mean cross-validation accuracy: ", mean_accuracy)

Iteration  1
Cross-validation accuracy:  0.4823529411764706
Iteration  2
Cross-validation accuracy:  0.40476190476190477
Iteration  3
Cross-validation accuracy:  0.4523809523809524
Iteration  4
Cross-validation accuracy:  0.36904761904761907
Iteration  5
Cross-validation accuracy:  0.39285714285714285
Iteration  6
Cross-validation accuracy:  0.5238095238095238
Iteration  7
Cross-validation accuracy:  0.44047619047619047
Iteration  8
Cross-validation accuracy:  0.4166666666666667
Iteration  9
Cross-validation accuracy:  0.5238095238095238
Iteration  10
Cross-validation accuracy:  0.4523809523809524
Mean cross-validation accuracy:  0.4458543417366947
