In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from nltk.stem.porter import PorterStemmer
import re

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [2]:
# Import the dataset
dataset = pd.read_csv('all.csv')
dataset

Unnamed: 0,author,content,poem name,age,type
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore
...,...,...,...,...,...
568,SARA TEASDALE,"With the man I love who loves me not,\r\nI wal...",Union Square,Modern,Love
569,HART CRANE,"Hart Crane, ""Voyages I, II, III, IV, V, VI"" fr...",Voyages,Modern,Love
570,WILLIAM BUTLER YEATS,"When you are old and grey and full of sleep,\r...",When You Are Old,Modern,Love
571,CARL SANDBURG,"Give me hunger,\r\nO you gods that sit and giv...",At a Window,Modern,Love


In [3]:
# Create independant and dependant variables
X = dataset.iloc[:, [1]]
y = dataset.iloc[:, [-1]]

In [4]:
X.iloc[0,0]

'Let the bird of loudest lay\r\nOn the sole Arabian tree\r\nHerald sad and trumpet be,\r\nTo whose sound chaste wings obey.\r\n\r\nBut thou shrieking harbinger,\r\nFoul precurrer of the fiend,\r\nAugur of the fever\'s end,\r\nTo this troop come thou not near.\r\n\r\nFrom this session interdict\r\nEvery fowl of tyrant wing,\r\nSave the eagle, feather\'d king;\r\nKeep the obsequy so strict.\r\n\r\nLet the priest in surplice white,\r\nThat defunctive music can,\r\nBe the death-divining swan,\r\nLest the requiem lack his right.\r\n\r\nAnd thou treble-dated crow,\r\nThat thy sable gender mak\'st\r\nWith the breath thou giv\'st and tak\'st,\r\n\'Mongst our mourners shalt thou go.\r\n\r\nHere the anthem doth commence:\r\nLove and constancy is dead;\r\nPhoenix and the Turtle fled\r\nIn a mutual flame from hence.\r\n\r\nSo they lov\'d, as love in twain\r\nHad the essence but in one;\r\nTwo distincts, division none:\r\nNumber there in love was slain.\r\n\r\nHearts remote, yet not asunder;\r\nDis

In [5]:
# Cleaning the text
corpus = []

ps = PorterStemmer()

def stem(word):
    return ps.stem(word)

for i in range(len(dataset)):
    text = X.iloc[i,0].lower()
    text = re.sub('\\n', ' ', text)
    text = re.sub('\\r', ' ', text)
    text = re.sub('[\\\/\'\"_\-,:;.\(\)\{\}\?]', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub('\s[a-z]\s', ' ', text)
    text = nltk.word_tokenize(text)
    text = list(map(stem, text))
    text = ' '.join(text)
    corpus.append(text)

In [6]:
# Vectorize text using TF-IDF model
cv = TfidfVectorizer(max_features=1000,
                     stop_words='english',
                     max_df = 0.6,
                     min_df = 3)

X = cv.fit_transform(corpus).toarray()

In [8]:
# Creating dictionary of different models and parameters
model_params = {
    'svm': {
        'model': SVC(),
        'params': {
            'C': [0.25, 0.5, 0.75 ,1],
            'kernel': ['rbf','linear'],
            'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10, 40, 100]
        }
    },
    'naive_bayes': {
        'model': MultinomialNB(),
        'params': {
            'alpha':[0, 0.5, 1.0]
        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_features': ['auto', 'sqrt', 'log2']
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(multi_class='auto'),
        'params': {
            'C': [0.25, 0.5, 0.75, 1, 5, 10],
            'solver': ['liblinear', 'lbfgs', 'sag', 'saga']
        }
    },
    
}

In [9]:
# GridSearchCV model
scores = []
for model_name, mp in model_params.items():
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, n_jobs = -1)
    clf.fit(X, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
df = pd.DataFrame(scores)

print(df)

  return f(*args, **kwargs)
  self.best_estimator_.fit(X, y, **fit_params)
  return f(*args, **kwargs)


                 model  best_score  \
0                  svm    0.638673   
1        random_forest    0.607185   
2          naive_bayes    0.633364   
3        decision_tree    0.528711   
4  logistic_regression    0.636903   

                                        best_params  
0     {'C': 0.75, 'gamma': 0.1, 'kernel': 'linear'}  
1                              {'n_estimators': 40}  
2                                    {'alpha': 0.5}  
3  {'criterion': 'entropy', 'max_features': 'auto'}  
4                   {'C': 5, 'solver': 'liblinear'}  


  return f(*args, **kwargs)
