In [2]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.stem.snowball import EnglishStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.preprocessing import MultiLabelBinarizer

# nltk.download('words')

In [3]:
def text_to_raw(str):
    # Remove punctuation
    str = [char for char in str if char not in string.punctuation]
    str = ''.join(str)
    
    # And use stemmer
    str = str.split(' ')
    str = [stemmer.stem(word) for word in str]
    str = ' '.join(str)
    
    return str

def remove_non_english(str):
    str = str.split(' ')
    str = [word for word in str if word not in englishwords]
    str = ' '.join(str)
    
    return str

def load_glove_model(File):
    glove_model = {}
    with open(File,'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    return glove_model

def map_overview(line):
    overviewvector = np.array([])
    for word in line.split():
        try:
            overviewvector = np.concatenate([overviewvector, glove[word]], axis=0)
        except KeyError:
            pass
    
    if len(overviewvector) == 0:
        return []
    
    # Change data structure to [ [], [], [] ... ] format
    ovec2 = np.zeros(shape=(int(len(overviewvector)/wordlength), wordlength))
    for i in range( int(len(overviewvector)/wordlength) ) :
        ovec2[i] = overviewvector[i * wordlength : i * wordlength + wordlength]
        
    # Calculate the center of the points
    ovec2 = sum(ovec2)/len(ovec2)
    
    return ovec2

1. part: Deal with non-english text

In [5]:
# Separate movies to 2 files based on overview column
movies = pd.read_csv('movies_metadata.csv', low_memory=False)

movieswithoverview = movies[movies['overview'].notnull()]
movies = movies[movies['overview'].isnull()]

## Write
# movies.to_csv('1a movies_withoutoverview.csv', index=False)
# movieswithoverview.to_csv('1b movies_withoverview.csv', index=False)

In [6]:
# Read and select movies with overview
movieswithoverview = pd.read_csv('1b movies_withoverview.csv', low_memory=False)

In [7]:
# Use stemmer
stemmer = EnglishStemmer()
movieswithoverview['overview'] = movieswithoverview['overview'].apply(text_to_raw)

In [8]:
# Get rid of non english text
# This took me at least 1 hour to run!
englishwords = nltk.corpus.words.words()
movieswithoverview['overview'] = movieswithoverview['overview'].apply(remove_non_english)

In [13]:
## Write to file
# movieswithoverview.dropna(inplace=True)
# movieswithoverview.to_csv('2 movies_english.csv', index=False)

2. part: TF-IDF

In [14]:
movieswithoverview = pd.read_csv('2 movies_english.csv', low_memory=False)

tfidf = TfidfVectorizer(lowercase=True, stop_words='english')
vector = tfidf.fit_transform(movieswithoverview['overview'])

In [12]:
# The number of words is reduced to ~57 000
vector

<44062x56883 sparse matrix of type '<class 'numpy.float64'>'
	with 474015 stored elements in Compressed Sparse Row format>

In [9]:
# Dimension reduction
k = 50 # Number of components
svd = TruncatedSVD(n_components=k)
vector = svd.fit_transform(vector)

In [10]:
# Store vector into dataframe and join them
components = pd.DataFrame(vector, columns=[str(i) + '. overview component' for i in range(0,k)])

movieswithcomponents = movieswithoverview.join(components).drop('overview', axis=1)

In [11]:
## Write to file
movieswithcomponents.to_csv('3a movies_TFIDF.csv', index=False)

3. part: Embedded Layers

In [16]:
movieswithoverview = pd.read_csv('2 movies_english.csv', low_memory=False)

glove = load_glove_model('glove.6B.50d.txt')
wordlength = len(glove[list(glove.keys())[0]])

In [17]:
# Embedded Layers
movieswithoverview['overview'] = movieswithoverview['overview'].apply(map_overview)

In [18]:
components = pd.DataFrame(movieswithoverview['overview'].to_list(), columns=[str(col) + '. overview component' for col in range(len(movieswithoverview['overview'][0]))])
components = components.fillna(value=0)

movieswithoverview = movieswithoverview.drop('overview', axis=1).join(components)

In [19]:
## Write to file
# movieswithoverview.to_csv('3b movies_Embedded.csv', index=False)

Example how the text changed

In [15]:
movieswithoverview['overview'][40000]

'Zhigen, an old Chinese farmer, has lived alone in Beijing for over 20 years after moving to the city to allow his son Chongyi to attend university. He decides to make the long journey from Beijing to Yangshuo to honour the promise he made to his wife to bring back the bird that has been his only companion in the city. His daughter-in-law Qianying, a beautiful rich career woman, asks him to take along his granddaughter Renxing, an only child brought up in the lap of luxury. While grandfather and granddaughter set out on their journey - one travelling back in time, the other discovering her roots - Chongyi and Qianying, ponder the meaning of the life they have led in the sole pursuit of success and money.'

In [13]:
remove_non_english(movieswithoverview['overview'][40000])

'zhigen chines has alon beij 20 citi chongyi univers decid beij yangshuo honour promis has onli citi daughterinlaw qiani granddaught renx onli luxuri grandfath granddaught  discov  chongyi qiani'