In [1]:
import pandas as pd
import string
import nltk
from nltk.stem.snowball import EnglishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

#nltk.download('words')

In [2]:
def text_to_raw(str):
    # Remove punctuation
    str = [char for char in str if char not in string.punctuation]
    str = ''.join(str)
    
    # And use stemmer
    str = str.split(' ')
    str = [stemmer.stem(word) for word in str]
    str = ' '.join(str)
    
    return str

def remove_non_english(str):
    str = str.split(' ')
    str = [word for word in str if word not in englishwords]
    str = ' '.join(str)
    
    return str

In [6]:
# Separate movies to 2 files based on overview column
movies = pd.read_csv('movies_metadata.csv', low_memory=False)

movieswithoverview = movies[movies['overview'].notnull()]
movies = movies[movies['overview'].isnull()]

## Write
# movies.to_csv('movies_metadata.csv', index=False)
# movieswithoverview.to_csv('movies_metadata_withoverview.csv', index=False)

In [14]:
# Read and select movies with overview
movieswithoverview = pd.read_csv('movies_metadata_withoverview.csv', low_memory=False)

In [4]:
# Use stemmer
stemmer = EnglishStemmer()
movieswithoverview['overview'] = movieswithoverview['overview'].apply(text_to_raw)

In [5]:
# Get rid of non english text
# This took me at least 1 hour to run!
englishwords = nltk.corpus.words.words()
movieswithoverview['overview'] = movieswithoverview['overview'].apply(remove_non_english)

In [6]:
# Use TF-IDF
tfidf = TfidfVectorizer(lowercase=True, stop_words='english')
vector = tfidf.fit_transform(movieswithoverview['overview'])

In [7]:
# The number of words is reduced to ~57 000
vector

<44506x56946 sparse matrix of type '<class 'numpy.float64'>'
	with 474961 stored elements in Compressed Sparse Row format>

In [8]:
# Dimension reduction
k = 50 # Number of components
svd = TruncatedSVD(n_components=k)
vector = svd.fit_transform(vector)

In [9]:
# Store vector into dataframe and join them
components = pd.DataFrame(vector, columns=[str(i) + '. overview component' for i in range(0,k)])

movieswithcomponents = movieswithoverview.join(components).drop('overview', axis=1)

In [10]:
## Write to file
# movieswithcomponents.to_csv('movies_metadata_withcomponents.csv', index=False)

In [15]:
# Example how the text changed
movieswithoverview['overview'][40000]

'Zhigen, an old Chinese farmer, has lived alone in Beijing for over 20 years after moving to the city to allow his son Chongyi to attend university. He decides to make the long journey from Beijing to Yangshuo to honour the promise he made to his wife to bring back the bird that has been his only companion in the city. His daughter-in-law Qianying, a beautiful rich career woman, asks him to take along his granddaughter Renxing, an only child brought up in the lap of luxury. While grandfather and granddaughter set out on their journey - one travelling back in time, the other discovering her roots - Chongyi and Qianying, ponder the meaning of the life they have led in the sole pursuit of success and money.'

In [13]:
remove_non_english(movieswithoverview['overview'][40000])

'zhigen chines has alon beij 20 citi chongyi univers decid beij yangshuo honour promis has onli citi daughterinlaw qiani granddaught renx onli luxuri grandfath granddaught  discov  chongyi qiani'