In [1]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.stem.snowball import EnglishStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.preprocessing import MultiLabelBinarizer

from scipy.spatial import distance

import time

# nltk.download('words')

In [2]:
def text_to_raw(str):
    """Strip punctuation from a text and stem each of its words.

    Every character found in ``string.punctuation`` is dropped, the remainder
    is cut on single spaces, each piece is passed through the module-level
    ``stemmer`` and the stemmed pieces are glued back with single spaces.

    Note: the parameter name shadows the built-in ``str``; it is kept so that
    existing callers are unaffected.
    """
    # Punctuation filter
    cleaned = ''.join(ch for ch in str if ch not in string.punctuation)

    # Stem the space-separated tokens and rebuild the text
    return ' '.join(stemmer.stem(token) for token in cleaned.split(' '))

def remove_non_english(str):
    """Keep only the words of a text that occur in ``englishwords``.

    The text is cut on single spaces; pieces that are not members of the
    module-level ``englishwords`` collection are discarded and the remaining
    ones are rejoined with single spaces.
    """
    return ' '.join(token for token in str.split(' ') if token in englishwords)

def load_glove_model(File):
    """Load GloVe word vectors from a plain-text file.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    File : str or path-like
        Path of the UTF-8 encoded GloVe text file.

    Returns
    -------
    dict
        Maps each word to a 1-D ``numpy.float64`` array with its embedding.
        If a word occurs more than once, the last occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be converted to a float.
    """
    glove_model = {}
    with open(File, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            # A blank line (e.g. an empty line at the end of the file) has no
            # word to index and would raise IndexError below; skip it.
            if not split_line:
                continue
            word, *values = split_line
            glove_model[word] = np.array(values, dtype=np.float64)
    return glove_model

def map_overview(line):
    """Turn a text into the matrix of GloVe vectors of its known words.

    Relies on the module-level ``glove`` mapping (word -> vector) and on
    ``wordlength`` (size of one vector). Words missing from ``glove`` are
    skipped.

    Parameters
    ----------
    line : str
        Whitespace-separated text.

    Returns
    -------
    numpy.ndarray or list
        Array of shape ``(number of known words, wordlength)`` with one row per
        known word, in text order; an empty list when no word is known.
    """
    # Collect the vectors first and build the array once: growing an array
    # with np.concatenate inside the loop re-copies everything for every word.
    vectors = [glove[word] for word in line.split() if word in glove]

    if len(vectors) == 0:
        return []

    # One row per word: [ [], [], [] ... ] format
    return np.asarray(vectors, dtype=np.float64).reshape(-1, wordlength)

def avg_of_words(line):
    """Return the center (mean) of a sequence of word vectors.

    An empty input yields an empty list; otherwise the items are added up with
    the built-in ``sum`` and the total is divided by the number of items.
    """
    n_items = len(line)
    if n_items == 0:
        return []

    # Mean of the points = their center
    return sum(line) / n_items

Separate the movies into two files, based on whether the overview column is filled in

In [5]:
# Load the full metadata table
movies = pd.read_csv('movies_metadata.csv', low_memory=False)

# Rows with an overview go to their own frame; `movies` keeps the rows without one
movieswithoverview = movies.loc[movies['overview'].notnull()]
movies = movies.loc[movies['overview'].isnull()]

## Write
# movies.to_csv('1a movies_withoutoverview.csv', index=False)
# movieswithoverview.to_csv('1b movies_withoverview.csv', index=False)

Part 1: Deal with non-English text

In [21]:
# Load the movies that have an overview from the intermediate '1b' file
movieswithoverview = pd.read_csv(
    '1b movies_withoverview.csv',
    low_memory=False,
)

In [4]:
# Strip punctuation and stem every overview; `text_to_raw` reads `stemmer`
stemmer = EnglishStemmer()
movieswithoverview['overview'] = (
    movieswithoverview['overview']
    .apply(text_to_raw)
)

In [5]:
# Get rid of non english text
# `remove_non_english` tests every word of every overview for membership in
# `englishwords`. As a plain list each test scans the whole vocabulary, which
# is why this cell used to take at least an hour; a set gives the same answers
# with constant-time lookups.
englishwords = set(nltk.corpus.words.words())
movieswithoverview['overview'] = movieswithoverview['overview'].apply(remove_non_english)

In [10]:
## Write to file
#movieswithoverview.dropna(inplace=True)
#movieswithoverview.to_csv('2 movies_english.csv', index=False)

Part 2: TF-IDF

In [11]:
# Reload the cleaned overviews
movieswithoverview = pd.read_csv('2 movies_english.csv', low_memory=False)

# TF-IDF weights of the overviews (lower-cased, English stop words removed)
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
)
vector = tfidf.fit_transform(movieswithoverview['overview'])

In [12]:
# Display the TF-IDF matrix: the number of distinct words (its columns) is reduced to ~12 000
vector

<44392x11872 sparse matrix of type '<class 'numpy.float64'>'
	with 719497 stored elements in Compressed Sparse Row format>

In [13]:
# Dimension reduction
k = 50 # Number of components
# TruncatedSVD uses a randomized solver by default; fix the seed so that the
# components are the same on every run.
svd = TruncatedSVD(n_components=k, random_state=0)
vector = svd.fit_transform(vector)

In [14]:
# Put the k reduced components into named columns ...
components = pd.DataFrame(
    vector,
    columns=['{}. overview component'.format(i) for i in range(k)],
)

# ... and attach them to the movies in place of the raw overview text
movieswithcomponents = movieswithoverview.join(components).drop(columns='overview')

In [15]:
## Write to file
# movieswithcomponents.to_csv('3a movies_TFIDF.csv', index=False)

Part 3: GloVe model

In [16]:
movieswithoverview = pd.read_csv('2 movies_english.csv', low_memory=False)

glove = load_glove_model('glove.6B.50d.txt')
# Size of one embedding, read from the first vector. next(iter(...)) avoids
# copying every key of the vocabulary into a list just to look at one entry.
wordlength = len(next(iter(glove.values())))

In [17]:
# Replace each overview by the GloVe vectors of its words
movieswithoverview['overview'] = (
    movieswithoverview['overview']
    .apply(map_overview)
)

In [18]:
# Average (center) of words method
# NOTE(review): plain assignment, not a copy -- both names refer to the same
# DataFrame, so the line below also replaces 'overview' in `movieswithoverview`.
movieswithoverview_avg = movieswithoverview
movieswithoverview_avg['overview'] = movieswithoverview['overview'].apply(avg_of_words)

In [19]:
# Vector to different columns
# One column per embedding dimension. The width comes from `wordlength` rather
# than from the length of row 0, which is an empty list (length 0) whenever the
# first movie has no word known to GloVe.
components = pd.DataFrame(
    movieswithoverview_avg['overview'].to_list(),
    columns=[str(col) + '. overview component' for col in range(wordlength)],
)
# Rows built from an empty list carry no values; fill them with 0
components = components.fillna(value=0)

movieswithoverview_avg = movieswithoverview_avg.drop('overview', axis=1).join(components)

In [20]:
## Write to file
# movieswithoverview_avg.to_csv('3b movies_GloVe.csv', index=False)

Example of how the text changed

In [22]:
# Take the example from the raw '1b' file: at this point in the notebook
# `movieswithoverview['overview']` no longer holds text but GloVe vectors.
msg = pd.read_csv('1b movies_withoverview.csv', low_memory=False)['overview'][40000]
msg

'Zhigen, an old Chinese farmer, has lived alone in Beijing for over 20 years after moving to the city to allow his son Chongyi to attend university. He decides to make the long journey from Beijing to Yangshuo to honour the promise he made to his wife to bring back the bird that has been his only companion in the city. His daughter-in-law Qianying, a beautiful rich career woman, asks him to take along his granddaughter Renxing, an only child brought up in the lap of luxury. While grandfather and granddaughter set out on their journey - one travelling back in time, the other discovering her roots - Chongyi and Qianying, ponder the meaning of the life they have led in the sole pursuit of success and money.'

In [23]:
# Run the example through the same cleaning steps as the full dataset
stemmer = EnglishStemmer()
# A set makes the per-word membership test in `remove_non_english` constant-time
englishwords = set(nltk.corpus.words.words())
msg = text_to_raw(msg)
msg = remove_non_english(msg)
msg

'an old farmer live in for over year after move to the to allow his son to attend he to make the long journey from to to the he made to his wife to bring back the bird that been his companion in the his a beauti rich career woman ask him to take along his an child brought up in the lap of while and set out on their journey one travel back in time the other her root and ponder the mean of the life they have led in the sole pursuit of success and money'