In [30]:


# General
import numpy as np
import pandas as pd
import re
import nltk

# Data Exploration

import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud
from collections import Counter
     


In [31]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
from sklearn.metrics import average_precision_score,roc_auc_score, roc_curve, precision_recall_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

np.random.seed(42)

In [32]:
df = pd.read_csv("final_ka_final.csv");

In [5]:
df.isnull().sum()

Unnamed: 0.2            0
movie_id                0
actor_1_gender          0
actor_2_gender          0
actor_1_name            0
actor_2_name            0
director_gender         0
director_name           0
Unnamed: 0.1            0
Unnamed: 0              0
adult                   0
budget                  0
genres                  0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
title                   0
vote_average            0
vote_count              0
dtype: int64

In [34]:


# temp = pd.DataFrame()
# temp['Label'] = ['Real Disaster','No Disaster']
# temp['Count'] = [len(df[df['overview'] == 1]), len(df[df['target'] == 0])]
# temp = temp.sort_values(by = ['Count'], ascending = False)
# fig = px.bar(temp, x = 'Label', y = 'Count', 
#              color = "Label", text_auto='', width = 600, 
#              color_discrete_sequence = ["#AB63FA", "orange"],
#              template = 'plotly_dark',
#             title = "Count Plot")

# fig.update_xaxes(showgrid = False)
# fig.update_yaxes(showgrid = False)
# fig.update_traces(textfont_size=12, textangle = 0, textposition = "outside", cliponaxis = False)
# fig.show()
     


In [36]:
df['overview'] = df['overview'].astype('str')

In [49]:
df.dtypes

Unnamed: 0.2              int64
movie_id                  int64
actor_1_gender            int64
actor_2_gender            int64
actor_1_name             object
actor_2_name             object
director_gender           int64
director_name            object
Unnamed: 0.1              int64
Unnamed: 0               object
adult                      bool
budget                    int64
genres                   object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                 float64
runtime                 float64
spoken_languages         object
title                    object
vote_average            float64
vote_count              float64
clean_text               object
dtype: object

In [39]:


#convert to lowercase and remove punctuations and characters and then strip
def preprocess(text):
    text = text.lower() #lowercase text
    text=text.strip()  #get rid of leading/trailing whitespace 
    text=re.compile('<.*?>').sub('', text) #Remove HTML tags/markups
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  #Replace punctuation with space. Careful since punctuation can sometime be useful
    text = re.sub('\s+', ' ', text)  #Remove extra space and tabs
    text = re.sub(r'',' ',text) #[0-9] matches any digit (0 to 10000...)
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) #matches any digit from 0 to 100000..., \D matches non-digits
    text = re.sub(r'\s+',' ',text) #\s matches any whitespace, \s+ matches multiple whitespace, \S matches non-whitespace 
    
    return text

In [40]:
#1. STOPWORD REMOVAL
from nltk.corpus import stopwords
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

#2. STEMMING
 
# Initialize the stemmer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
snow = SnowballStemmer('english')
def stemming(string):
    a=[snow.stem(i) for i in word_tokenize(string) ]
    return " ".join(a)

#3. LEMMATIZATION
# Initialize the lemmatizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
wl = WordNetLemmatizer()
 
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Tokenize the sentence
def lemmatizer(string):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)

In [41]:

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
     

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pratham/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/pratham/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/pratham/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /home/pratham/nltk_data...


True

In [44]:
#FINAL PREPROCESSING
def finalpreprocess(string):
    return lemmatizer(stopword(preprocess(string)))

df['clean_text'] = df['overview'].apply(lambda x: finalpreprocess(x))
df.head()
     

Unnamed: 0.3,Unnamed: 0.2,movie_id,actor_1_gender,actor_2_gender,actor_1_name,actor_2_name,director_gender,director_name,Unnamed: 0.1,Unnamed: 0,...,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,title,vote_average,vote_count,clean_text
0,0,862,2,2,Tom Hanks,Tim Allen,2,John Lasseter,0,Animation,...,Pixar Animation Studios,United States of America,1995-10-30,373554033.0,81.0,English,Toy Story,7.7,5415.0,l e b w n l v e h p p l n h r u n l n b r h b ...
1,1,8844,2,2,Robin Williams,Jonathan Hyde,2,Joe Johnston,1,Adventure,...,TriStar Pictures,United States of America,1995-12-15,262797249.0,104.0,English,Jumanji,6.9,2413.0,w h e n b l n g j u n p e e r c v e r n e n c ...
2,3,31357,1,1,Whitney Houston,Angela Bassett,2,Forest Whitaker,3,Comedy,...,Twentieth Century Fox Film Corporation,United States of America,1995-12-22,81452156.0,127.0,English,Waiting to Exhale,6.1,34.0,c h e e n r e e n e p p e n h e w e n r e h l ...
3,4,11862,2,1,Steve Martin,Diane Keaton,2,Charles Shyer,4,Comedy,...,Sandollar Productions,United States of America,1995-02-10,76578911.0,106.0,English,Father of the Bride Part II,5.7,173.0,j u w h e n g e r g e b n k h r e c v e r e f ...
4,5,949,2,2,Al Pacino,Robert De Niro,2,Michael Mann,5,Action,...,Regency Enterprises,United States of America,1995-12-15,187436818.0,170.0,English,Heat,7.7,1886.0,b e v e e r h e f n e l c c u l e l e p n c h ...


In [69]:
df.dtypes

Unnamed: 0.2              int64
movie_id                  int64
actor_1_gender            int64
actor_2_gender            int64
actor_1_name             object
actor_2_name             object
director_gender           int64
director_name            object
Unnamed: 0.1              int64
Unnamed: 0               object
adult                      bool
budget                    int64
genres                   object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                 float64
runtime                 float64
spoken_languages         object
title                    object
vote_average            float64
vote_count              float64
clean_text               object
dtype: object

In [73]:
df['overview'] = df['overview'].astype('|S')

UnicodeEncodeError: 'ascii' codec can't encode character '\u2013' in position 34: ordinal not in range(128)

In [72]:
df['overview'] = df['overview'].r
eplace(u"\u2013", "-")

In [66]:
df1 = str(df['overview']).encode('utf-8')