In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
warnings.filterwarnings('ignore')

In [2]:
poem=pd.read_csv('all.csv')
poem['label']='poem'
poem.head()

Unnamed: 0,author,content,poem name,age,type,label
0,WILLIAM SHAKESPEARE,Let the bird of loudest lay\r\nOn the sole Ara...,The Phoenix and the Turtle,Renaissance,Mythology & Folklore,poem
1,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,"Sir Charles into my chamber coming in,\r\nWhen...",An Epilogue to the Above,Renaissance,Mythology & Folklore,poem
2,THOMAS BASTARD,"Our vice runs beyond all that old men saw,\r\n...","Book 7, Epigram 42",Renaissance,Mythology & Folklore,poem
3,EDMUND SPENSER,"Lo I the man, whose Muse whilome did maske,\r\...","from The Faerie Queene: Book I, Canto I",Renaissance,Mythology & Folklore,poem
4,RICHARD BARNFIELD,"Long have I longd to see my love againe,\r\nSt...",Sonnet 16,Renaissance,Mythology & Folklore,poem


In [3]:
poem.shape
poem.author.value_counts()

WILLIAM SHAKESPEARE     71
SIR PHILIP SIDNEY       42
JOHN DONNE              41
EDMUND SPENSER          34
WILLIAM BUTLER YEATS    26
                        ..
JOHN FLETCHER            1
ROBERT SOUTHWELL, SJ     1
SIR EDWARD DYER          1
ISABELLA WHITNEY         1
THOMAS HEYWOOD           1
Name: author, Length: 67, dtype: int64

In [4]:
novel=pd.read_csv('booksummaries.txt',delimiter='\t',
    header='infer',
    names=['bookno','path','bookname','author','year','genre','content'])

In [5]:
novel['label']='novel'
novel.head()

Unnamed: 0,bookno,path,bookname,author,year,genre,content,label
0,620,/m/0hhy,Animal Farm,George Orwell,1945-08-17,"{""/m/016lj8"": ""Roman \u00e0 clef"", ""/m/06nbt"":...","Old Major, the old boar on the Manor Farm, ca...",novel
1,843,/m/0k36,A Clockwork Orange,Anthony Burgess,1962,"{""/m/06n90"": ""Science Fiction"", ""/m/0l67h"": ""N...","Alex, a teenager living in near-future Englan...",novel
2,986,/m/0ldx,The Plague,Albert Camus,1947,"{""/m/02m4t"": ""Existentialism"", ""/m/02xlf"": ""Fi...",The text of The Plague is divided into five p...,novel
3,1756,/m/0sww,An Enquiry Concerning Human Understanding,David Hume,,,The argument of the Enquiry proceeds by a ser...,novel
4,2080,/m/0wkt,A Fire Upon the Deep,Vernor Vinge,,"{""/m/03lrw"": ""Hard science fiction"", ""/m/06n90...",The novel posits that space around the Milky ...,novel


In [6]:
novel=novel[novel['author'].notnull()]

In [7]:
novel.shape
novel.author.value_counts()

Agatha Christie            74
Franklin W. Dixon          68
K. A. Applegate            62
Stephen King               60
Edgar Rice Burroughs       59
                           ..
Robert Erskine Childers     1
John Metcalfe               1
Marcel Proust               1
Mo Yan                      1
Shane Dix                   1
Name: author, Length: 4714, dtype: int64

In [8]:
data=pd.merge(poem,novel.iloc[0:1500,:],how='outer')

In [9]:
data.shape

(2073, 11)

In [10]:
# I would want to take only content and label
data=data.loc[:,['content','author','label']]

In [11]:
print(data.shape)
data.head()

(2073, 3)


Unnamed: 0,content,author,label
0,Let the bird of loudest lay\r\nOn the sole Ara...,WILLIAM SHAKESPEARE,poem
1,Let the bird of loudest lay\r\nOn the sole Ara...,WILLIAM SHAKESPEARE,poem
2,"Sir Charles into my chamber coming in,\r\nWhen...",DUCHESS OF NEWCASTLE MARGARET CAVENDISH,poem
3,"Our vice runs beyond all that old men saw,\r\n...",THOMAS BASTARD,poem
4,"Lo I the man, whose Muse whilome did maske,\r\...",EDMUND SPENSER,poem


In [12]:
data.content

0       Let the bird of loudest lay\r\nOn the sole Ara...
1       Let the bird of loudest lay\r\nOn the sole Ara...
2       Sir Charles into my chamber coming in,\r\nWhen...
3       Our vice runs beyond all that old men saw,\r\n...
4       Lo I the man, whose Muse whilome did maske,\r\...
                              ...                        
2068     A deadly virus has swept the world, killing o...
2069     Bertie returns to London from several weeks i...
2070     In the 1960s, young Celia Marsdon is a rich A...
2071     Stig is a caveman. He lives at the bottom of ...
2072     Emil Sinclair is a young boy raised in a bour...
Name: content, Length: 2073, dtype: object

In [13]:
data.isnull().sum()

content    0
author     0
label      0
dtype: int64

In [14]:
import re
def clean_text(text):
    return ' '.join(re.sub("(@,[A-Za-z0-9]+)|([^0-9A-Za-z \t \r \n])|(\w+:\/\/\S+)|([0-9])", " ", text).lower().split())

In [15]:
data['content']=data.content.apply(clean_text)
#data['author']=data.author.apply(clean_text)
    
data.head()

Unnamed: 0,content,author,label
0,let the bird of loudest lay on the sole arabia...,WILLIAM SHAKESPEARE,poem
1,let the bird of loudest lay on the sole arabia...,WILLIAM SHAKESPEARE,poem
2,sir charles into my chamber coming in when i w...,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,poem
3,our vice runs beyond all that old men saw and ...,THOMAS BASTARD,poem
4,lo i the man whose muse whilome did maske as t...,EDMUND SPENSER,poem


In [16]:
data.tail()

Unnamed: 0,content,author,label
2068,a deadly virus has swept the world killing off...,O. T Nelson,novel
2069,bertie returns to london from several weeks in...,P. G. Wodehouse,novel
2070,in the s young celia marsdon is a rich america...,Anya Seton,novel
2071,stig is a caveman he lives at the bottom of th...,Clive King,novel
2072,emil sinclair is a young boy raised in a bourg...,Hermann Hesse,novel


In [17]:
def remove_stopwords(text):
    stop_words = stopwords.words('english')
    word_tokens = word_tokenize(text)
    filtered_text = ' '.join(word for word in word_tokens if word not in stop_words)
    return filtered_text

In [18]:
data['content']=data.content.apply(remove_stopwords)


In [19]:
data.head()

Unnamed: 0,content,author,label
0,let bird loudest lay sole arabian tree herald ...,WILLIAM SHAKESPEARE,poem
1,let bird loudest lay sole arabian tree herald ...,WILLIAM SHAKESPEARE,poem
2,sir charles chamber coming writing fairy queen...,DUCHESS OF NEWCASTLE MARGARET CAVENDISH,poem
3,vice runs beyond old men saw far authentically...,THOMAS BASTARD,poem
4,lo man whose muse whilome maske time taught lo...,EDMUND SPENSER,poem


In [20]:
data.tail()

Unnamed: 0,content,author,label
2068,deadly virus swept world killing everyone age ...,O. T Nelson,novel
2069,bertie returns london several weeks cannes spe...,P. G. Wodehouse,novel
2070,young celia marsdon rich american heiress upon...,Anya Seton,novel
2071,stig caveman lives bottom old chalk pit close ...,Clive King,novel
2072,emil sinclair young boy raised bourgeois home ...,Hermann Hesse,novel


In [21]:
y=data['label']
for i in range(len(y)) :
    if y[i]=='poem':
        y[i]=1
    else:
        y[i]=0


In [22]:
y=y.astype('int')

In [23]:
#x=data.loc[:,['content','author']]
x=data.loc[:,['content']]

In [24]:

x=x['content'].values

In [25]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [26]:
vectorizer = CountVectorizer()
vectorizer.fit(x_train)
X_train = vectorizer.transform(x_train)
X_test = vectorizer.transform(x_test)

In [27]:
classifier = LogisticRegression()
classifier.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
score1 = classifier.score(X_train, y_train)
score = classifier.score(X_test, y_test)
print('Train accuracy:{} \nTest accuracy:{} ' .format((score1*100),(score*100)))

Train accuracy:100.0 
Test accuracy:99.03614457831326 


In [29]:
logisticpredict=classifier.predict(X_test)

In [30]:
print(classification_report(y_test,logisticpredict))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       293
           1       0.98      0.99      0.98       122

    accuracy                           0.99       415
   macro avg       0.99      0.99      0.99       415
weighted avg       0.99      0.99      0.99       415



In [31]:
test="""Detonative top dressing allelic pip adequate mako longish daubentonia madagascariensis europocentric hyperlipidemia uniovulate acer platanoides undocumented semnopithecus entellus unread garonne improved family roccellaceae livid italo calvino inquisitive gunter fabian lower wing pan troglodytes schweinfurthii entitled taklamakan desert familial genus drypis platyrhinian hygiene smoggy bastinado speckless hug drug knobby patient algorithmic repeating decimal exhortative sierra leonean lxxviii recombinant protein transitory sense impression stereotypical implementation flea olympian games bowfront fishplate bibliothecarial boxer deskbound salad bar flavourless meatman lucullan auditory meatus unforethoughtful cucurbita."""

In [34]:
test1="""Above your head no star will flame.

    One weary sound will be the same—

    the hoarse roar of the gale.

    The shadows fall from your tired eyes

    as your lone bedside candle dies,

    for here the calendar breeds nights

    till stores of candles fail"""

In [36]:
test2="""Alex, a teenager living in near-future England, leads his gang on nightly orgies of opportunistic, random "ultra-violence." Alex's friends ("droogs" in the novel's Anglo-Russian slang, Nadsat) are: Dim, a slow-witted bruiser who is the gang's muscle; Georgie, an ambitious second-in-command; and Pete, who mostly plays along as the droogs indulge their taste for ultra-violence. Characterized as a sociopath and a hardened juvenile delinquent, Alex is also intelligent and quick-witted, with sophisticated taste in music, being particularly fond of Beethoven, or "Lovely Ludwig Van." The novel begins with the droogs sitting in their favorite hangout (the Korova Milkbar), drinking milk-drug cocktails, called "milk-plus", to hype themselves for the night's mayhem. They assault a scholar walking home from the public library, rob a store leaving the owner and his wife bloodied and unconscious, stomp a panhandling derelict, then scuffle with a rival gang. Joyriding through the countryside in a stolen car, they break into an isolated cottage and maul the young couple living there, beating the husband and raping his wife."""

In [39]:
def predict(test):
    clean=clean_text(test)
    clean=remove_stopwords(clean)
    clean=np.array([clean])
    t = vectorizer.transform(clean)
    result=classifier.predict(t)
    prob=np.max(classifier.predict_proba(t))
    if prob<0.90:
        print('tag:"NA",confidence:{:.2f}'.format(prob))
    elif result==0:
        print('tag:"novel",confidence:{:.2f}'.format(prob))
    else:
        print('tag:"poem",confidence:{:.2f}'.format(prob))

In [43]:
predict(test)

tag:"NA",confidence:0.67
