In [334]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics import euclidean_distances
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn import preprocessing


In [335]:
def decontract(sentence):
    sentence = re.sub(r"n\'t", " not", sentence)
    sentence = re.sub(r"\'re", " are", sentence)
    sentence = re.sub(r"\'s", " is", sentence)
    sentence = re.sub(r"\'d", " would", sentence)
    sentence = re.sub(r"\'ll", " will", sentence)
    sentence = re.sub(r"\'t", " not", sentence)
    sentence = re.sub(r"\'ve", " have", sentence)
    sentence = re.sub(r"\'m", " am", sentence)
    return sentence

def removePunctuation(sentence): 
    sentence = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]',r' ',sentence)
    sentence = sentence.strip()
    sentence = sentence.replace("\n"," ")
    return sentence

def removeNumber(sentence):
    alpha_sent = ""
    for word in sentence.split():
        alpha_word = re.sub('[^a-z A-Z]+', '', word)
        alpha_sent += alpha_word
        alpha_sent += " "
    alpha_sent = alpha_sent.strip()
    return alpha_sent

def stemming(sentence):
    stemmer = SnowballStemmer("english")
    stemmedSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemmedSentence += stem
        stemmedSentence += " "
    stemmedSentence = stemmedSentence.strip()
    return stemmedSentence

def clear(string):
    string = decontract(string)
    string = removePunctuation(string)
    string = removeNumber(string)
    string = stemming(string)
    return string

clear("sfsaf as fa mfw,;;wmwf mdf,w341. 22333 1 a d g")

'sfsaf as fa mfw wmwf mdf w a d g'

In [339]:
train_df

Unnamed: 0,id,movie,dialogue,genres
0,0,0,i thought you were in a meet br i am with you,"[drama, romance]"
1,1,1,are you sure you are okay you are pale br i fe...,[drama]
2,2,2,go on get out br mom look do not say anyth fir...,[comedy]
3,3,3,i could have lost my fuck hand br that would h...,"[mystery, thriller]"
4,4,4,stick with me on this gloria i need you br and...,"[crime, thriller]"
...,...,...,...,...
36986,36986,246,there is a man downstair he brought us egg he ...,"[drama, war]"
36987,36987,43,hi br i would prefer it if you did not speak t...,"[comedy, drama]"
36988,36988,459,i tri to call you i am run a littl late this i...,[drama]
36989,36989,174,what are you crazi br i just thought we should...,"[drama, romance]"


In [338]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df.dialogue = train_df.dialogue.apply(lambda x: clear(x))
test_df.dialogue = test_df.dialogue.apply(lambda x: clear(x))
# genres = (
#     {
#      'drama': 0,
#      'romance': 1,
#      'comedy': 2,
#      'mystery': 3,
#      'thriller': 4,
#      'crime': 5,
#      'action': 6,
#      'fantasy': 7,
#      'war': 8,
#      'sci-fi': 9,
#      'horror': 10,
#      'sport': 11,
#      'adventure': 12,
#      'music': 13,
#      'biography': 14,
#      'history': 15,
#      'western': 16,
#      'musical': 17,
#      'family': 18,
#      'animation': 19
#     }
# )


def convert_to_multilabels(x):
    result = [0] * len(genres.keys())
    newRes = []
    x = x.replace("[", "").replace("]", "").replace(",", "").replace("u'", "").replace("'", "")
    x = x.split()
#     for el in x:
#         index = genres[el]
#         result[index] = 1
    return x
    
train_df.genres = train_df.genres.apply(lambda x: convert_to_multilabels(x))

X_train, y_train = train_df[['id', 'dialogue']], train_df.genres
X_test = test_df[['id', 'dialogue']]
multiLabelBinarizer = MultiLabelBinarizer()
y_train = multiLabelBinarizer.fit_transform(y_train)

In [352]:
def f(x):
    res = []
    for z in x:
        res.append(z)
    if len(res) == 0:
        return "drama"
    return " ".join(res)


class MyClassifier(BaseEstimator, ClassifierMixin):
    
    def __init__(self, multiLabelBinarizer):
        self.X_train_tfidf = None
        self.X_train_counts = None
        self.clf = None
        self.count_vect = CountVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3))
        self.tfidf_transformer = TfidfTransformer()
        self.multiLabelBinarizer = multiLabelBinarizer
        
    def fit(self, X, y):
        self.X_train_counts = self.count_vect.fit_transform(X.dialogue)
        self.X_train_tfidf = self.tfidf_transformer.fit_transform(self.X_train_counts)
        self.X_train_tfidf = preprocessing.scale(self.X_train_tfidf, with_mean=False)
        self.clf = MultiOutputClassifier(LogisticRegression(multi_class='ovr', penalty='elasticnet', solver='saga', tol=0.01)).fit(self.X_train_tfidf, y)
        
    def predict(self, X):
        X_new_counts = self.count_vect.transform(X.dialogue)
        X_new_tfidf = self.tfidf_transformer.transform(X_new_counts)
        X_new_tfidf = preprocessing.scale(X_new_tfidf, with_mean=False)
        predicted = self.clf.predict(X_new_tfidf)
        
        out = X[['id']]
        out['genres'] = self.multiLabelBinarizer.inverse_transform(predicted)
        out.genres = out.genres.apply(lambda x: f(x))
        out.to_csv('result.csv', index=False)
        self.result_dataframe = out
        return predicted
       

In [None]:
# myclf = MultiOutputClassifier(MyClassifier())
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X_train.dialogue)
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# myclf = MultiOutputClassifier(MultinomialNB()).fit(X_train_tfidf, y_train)


X_test
# X_train
cld = MyClassifier(multiLabelBinarizer=multiLabelBinarizer)
cld.fit(X_train, y_train)
cld.predict(X_test)


# scores = cross_val_score(cld, X_train, y_train, cv = 3, scoring='f1_samples')
# print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [328]:
stopwords.

<WordListCorpusReader in '.../corpora/stopwords' (not loaded yet)>

In [290]:
for t in (1,6,7):
    print(t)

1
6
7


In [140]:
X_new_counts = count_vect.fit_transform(X_test.dialogue)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

ValueError: Input has n_features=17929 while the model has been trained with n_features=35707

In [285]:
predicted



array(["[u'drama']", "[u'drama']", "[u'drama']", ..., "[u'drama']",
       "[u'drama']", "[u'drama']"], dtype='<U81')

In [95]:
print(X_train.iloc[0:4, :])
print(y_train.head(6))
print(count_vect.vocabulary_.get(u'sure'))

   id                                           dialogue
0   0  I thought you were in a meeting--? <BR> I am. ...
1   1  Are you sure you're okay?  You're pale. <BR> I...
2   2  Go on! Get out! <BR> Mom look don't say anythi...
3   3  I could have lost my fucking hands. <BR> That ...
0       [u'drama', u'romance']
1                   [u'drama']
2                  [u'comedy']
3    [u'mystery', u'thriller']
4      [u'crime', u'thriller']
5      [u'comedy', u'romance']
Name: genres, dtype: object
31394


In [66]:
count_vect.vocabulary

In [88]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train',categories=categories, shuffle=True, random_state=42)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [92]:
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [94]:
twenty_train.target

array([1, 1, 3, ..., 2, 2, 2])

In [153]:
train_df

Unnamed: 0,id,movie,dialogue,genres
0,0,0,I thought you were in a meeting--? <BR> I am. ...,"[u'drama', u'romance']"
1,1,1,Are you sure you're okay? You're pale. <BR> I...,[u'drama']
2,2,2,Go on! Get out! <BR> Mom look don't say anythi...,[u'comedy']
3,3,3,I could have lost my fucking hands. <BR> That ...,"[u'mystery', u'thriller']"
4,4,4,Stick with me on this Gloria. I need you... <...,"[u'crime', u'thriller']"
...,...,...,...,...
36986,36986,246,There's a man downstairs. He brought us eggs....,"[u'drama', u'war']"
36987,36987,43,Hi. <BR> I'd prefer it if you didn't speak to ...,"[u'comedy', u'drama']"
36988,36988,459,I tried to call you I'm running a little late ...,[u'drama']
36989,36989,174,What are you crazy? <BR> I just thought we sho...,"[u'drama', u'romance']"


In [221]:
z = CountVectorizer()
z.fit(train_df.genres)
z.vocabulary_

0           [drama, romance]
1                    [drama]
2                   [comedy]
3        [mystery, thriller]
4          [crime, thriller]
                ...         
36986           [drama, war]
36987        [comedy, drama]
36988                [drama]
36989       [drama, romance]
36990         [crime, drama]
Name: genres, Length: 36991, dtype: object

In [200]:
train_df

Unnamed: 0,id,movie,dialogue,genres
0,0,0,I thought you were in a meeting--? <BR> I am. ...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,1,Are you sure you're okay? You're pale. <BR> I...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,2,Go on! Get out! <BR> Mom look don't say anythi...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,3,I could have lost my fucking hands. <BR> That ...,"[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,4,Stick with me on this Gloria. I need you... <...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
36986,36986,246,There's a man downstairs. He brought us eggs....,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
36987,36987,43,Hi. <BR> I'd prefer it if you didn't speak to ...,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
36988,36988,459,I tried to call you I'm running a little late ...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
36989,36989,174,What are you crazy? <BR> I just thought we sho...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])