In [4]:
# import csv

# def txt_to_csv(input_file, output_file):
#     with open(input_file, 'r') as txtfile, open(output_file, 'w', newline='') as csvfile:
#         csv_writer = csv.writer(csvfile)
#         for line in txtfile:
#             # Dividir la l√≠nea utilizando ":::" como separador
#             parts = line.strip().split(":::")
#             # Escribir las partes en el archivo CSV
#             csv_writer.writerow(parts)
   

# txt_to_csv('../data/Movies/train_data.txt', '../data/Movies/train_data.csv')
# txt_to_csv('../data/Movies/test_data.txt', '../data/Movies/test_data.csv')
# txt_to_csv('../data/Movies/test_data_solution.txt', '../data/Movies/test_data_solution.csv')


In [1]:
import pandas as pd

train_df = pd.read_csv('../data/Movies/train_data.csv', header=None)
test_df = pd.read_csv('../data/Movies/test_data.csv', header=None)
# Eliminar primera columna
train_df = train_df.drop(columns=[0])
test_df = test_df.drop(columns=[0])
# Asignar nombres a las columnas: 'title', 'genre', 'plot'
train_df.columns = ['title', 'genre', 'plot']
test_df.columns = ['title', 'plot']
train_df.head()

Unnamed: 0,title,genre,plot
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [2]:
print("There are {} training samples and {} test samples".format(len(train_df), len(test_df)))

There are 54214 training samples and 54200 test samples


In [3]:
print("There are {} unique genres in the training set".format(train_df['genre'].nunique()))

There are 27 unique genres in the training set


# Text cleaning and processing steps
1. Remove punctuations
2. Convert text to tokens
3. Remove tokens of length less than or equal to 3
4. Remove stopwords using NLTK corpus stopwords list to match
5. Apply stemming
6. Apply lemmatization
7. Convert words to feature vectors

## 1. Remove punctuations

In [4]:
# from "plot" columns remove any punctuation and convert to lowercase
import string as st 
train_df['no_punct'] = train_df['plot'].str.replace('[{}]'.format(st.punctuation), '').str.lower()
# replace any double-space by single-space
train_df['no_punct'] = train_df['no_punct'].str.replace('  ', ' ')
# remove leading and trailing whitespaces
train_df['no_punct'] = train_df['no_punct'].str.strip()
train_df.head()

Unnamed: 0,title,genre,plot,no_punct
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening in to a conversation between his doc...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,a brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,as the bus empties the students for their fiel...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,to help their unemployed father make ends meet...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,the film's title refers not only to the un-rec...


## 2. Convert text to tokens

In [5]:
# tokenize the plot_removed_punct column using re package
import re
train_df['tokenized'] = train_df['no_punct'].apply(lambda x: re.split('\s+', x))
train_df.head()

  train_df['tokenized'] = train_df['no_punct'].apply(lambda x: re.split('\s+', x))


Unnamed: 0,title,genre,plot,no_punct,tokenized
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening in to a conversation between his doc...,"[listening, in, to, a, conversation, between, ..."
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,a brother and sister with a past incestuous re...,"[a, brother, and, sister, with, a, past, inces..."
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,as the bus empties the students for their fiel...,"[as, the, bus, empties, the, students, for, th..."
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,to help their unemployed father make ends meet...,"[to, help, their, unemployed, father, make, en..."
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,the film's title refers not only to the un-rec...,"[the, film's, title, refers, not, only, to, th..."


In [6]:
# nltk tokenization
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
train_df['nltk_tokenized'] = train_df['no_punct'].apply(lambda x: word_tokenize(x))
train_df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\laura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,title,genre,plot,no_punct,tokenized,nltk_tokenized
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening in to a conversation between his doc...,"[listening, in, to, a, conversation, between, ...","[listening, in, to, a, conversation, between, ..."
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,a brother and sister with a past incestuous re...,"[a, brother, and, sister, with, a, past, inces...","[a, brother, and, sister, with, a, past, inces..."
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,as the bus empties the students for their fiel...,"[as, the, bus, empties, the, students, for, th...","[as, the, bus, empties, the, students, for, th..."
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,to help their unemployed father make ends meet...,"[to, help, their, unemployed, father, make, en...","[to, help, their, unemployed, father, make, en..."
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,the film's title refers not only to the un-rec...,"[the, film's, title, refers, not, only, to, th...","[the, film, 's, title, refers, not, only, to, ..."


## 3. Remove tokens of length less than or equal to 3

In [7]:
# Remove tokens of length less than 3

def remove_small_words(text):
    return [x for x in text if len(x) > 3 ]

train_df['long_tokens'] = train_df['nltk_tokenized'].apply(remove_small_words)
train_df.head()

Unnamed: 0,title,genre,plot,no_punct,tokenized,nltk_tokenized,long_tokens
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening in to a conversation between his doc...,"[listening, in, to, a, conversation, between, ...","[listening, in, to, a, conversation, between, ...","[listening, conversation, between, doctor, par..."
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,a brother and sister with a past incestuous re...,"[a, brother, and, sister, with, a, past, inces...","[a, brother, and, sister, with, a, past, inces...","[brother, sister, with, past, incestuous, rela..."
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,as the bus empties the students for their fiel...,"[as, the, bus, empties, the, students, for, th...","[as, the, bus, empties, the, students, for, th...","[empties, students, their, field, trip, museum..."
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,to help their unemployed father make ends meet...,"[to, help, their, unemployed, father, make, en...","[to, help, their, unemployed, father, make, en...","[help, their, unemployed, father, make, ends, ..."
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,the film's title refers not only to the un-rec...,"[the, film's, title, refers, not, only, to, th...","[the, film, 's, title, refers, not, only, to, ...","[film, title, refers, only, un-recovered, bodi..."


## 4. Remove stopwords using NLTK corpus stopwords list to match

In [8]:
def remove_stopwords(text):
    return [word for word in text if word not in nltk.corpus.stopwords.words('english')]
train_df['clean_tokens'] = train_df['long_tokens'].apply(lambda x : remove_stopwords(x))
train_df.head()

Unnamed: 0,title,genre,plot,no_punct,tokenized,nltk_tokenized,long_tokens,clean_tokens
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening in to a conversation between his doc...,"[listening, in, to, a, conversation, between, ...","[listening, in, to, a, conversation, between, ...","[listening, conversation, between, doctor, par...","[listening, conversation, doctor, parents, 10-..."
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,a brother and sister with a past incestuous re...,"[a, brother, and, sister, with, a, past, inces...","[a, brother, and, sister, with, a, past, inces...","[brother, sister, with, past, incestuous, rela...","[brother, sister, past, incestuous, relationsh..."
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,as the bus empties the students for their fiel...,"[as, the, bus, empties, the, students, for, th...","[as, the, bus, empties, the, students, for, th...","[empties, students, their, field, trip, museum...","[empties, students, field, trip, museum, natur..."
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,to help their unemployed father make ends meet...,"[to, help, their, unemployed, father, make, en...","[to, help, their, unemployed, father, make, en...","[help, their, unemployed, father, make, ends, ...","[help, unemployed, father, make, ends, meet, e..."
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,the film's title refers not only to the un-rec...,"[the, film's, title, refers, not, only, to, th...","[the, film, 's, title, refers, not, only, to, ...","[film, title, refers, only, un-recovered, bodi...","[film, title, refers, un-recovered, bodies, gr..."


## 5. Apply stemming

To stemize: reduce words to their base or root form, called a "stem." This process is applied to words with the goal of grouping together different forms of the same word so they can be analyzed as a single item. For example, the words "running," "runner," and "ran" might all be stemmized to the root "run." 

In [9]:
from nltk import PorterStemmer


In [10]:
def stemming(text):
    ps = PorterStemmer()
    return [ps.stem(word) for word in text]
train_df['stem_words'] = train_df['clean_tokens'].apply(lambda wrd: stemming(wrd))
train_df.head()


Unnamed: 0,title,genre,plot,no_punct,tokenized,nltk_tokenized,long_tokens,clean_tokens,stem_words
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening in to a conversation between his doc...,"[listening, in, to, a, conversation, between, ...","[listening, in, to, a, conversation, between, ...","[listening, conversation, between, doctor, par...","[listening, conversation, doctor, parents, 10-...","[listen, convers, doctor, parent, 10-year-old,..."
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,a brother and sister with a past incestuous re...,"[a, brother, and, sister, with, a, past, inces...","[a, brother, and, sister, with, a, past, inces...","[brother, sister, with, past, incestuous, rela...","[brother, sister, past, incestuous, relationsh...","[brother, sister, past, incestu, relationship,..."
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,as the bus empties the students for their fiel...,"[as, the, bus, empties, the, students, for, th...","[as, the, bus, empties, the, students, for, th...","[empties, students, their, field, trip, museum...","[empties, students, field, trip, museum, natur...","[empti, student, field, trip, museum, natur, h..."
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,to help their unemployed father make ends meet...,"[to, help, their, unemployed, father, make, en...","[to, help, their, unemployed, father, make, en...","[help, their, unemployed, father, make, ends, ...","[help, unemployed, father, make, ends, meet, e...","[help, unemploy, father, make, end, meet, edit..."
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,the film's title refers not only to the un-rec...,"[the, film's, title, refers, not, only, to, th...","[the, film, 's, title, refers, not, only, to, ...","[film, title, refers, only, un-recovered, bodi...","[film, title, refers, un-recovered, bodies, gr...","[film, titl, refer, un-recov, bodi, ground, ze..."


## 6. Apply lemmatization
Lemmatize: educing words to their lemma or base form. Unlike stemmization, which often simply removes affixes to find the stem, lemmatization involves a more complex analysis to correctly identify the lemma. This process takes into consideration the word's part of speech, its meaning in the sentence, and its morphological analysis to return the word's base or dictionary form. For example, "am," "are," and "is" would all be lemmatized to "be."

In [11]:
from nltk import WordNetLemmatizer

In [12]:
def lemmatize(text):
    word_net = WordNetLemmatizer()
    return [word_net.lemmatize(word) for word in text]

train_df['lemma_words'] = train_df['clean_tokens'].apply(lambda x : lemmatize(x))
train_df.head()

Unnamed: 0,title,genre,plot,no_punct,tokenized,nltk_tokenized,long_tokens,clean_tokens,stem_words,lemma_words
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening in to a conversation between his doc...,"[listening, in, to, a, conversation, between, ...","[listening, in, to, a, conversation, between, ...","[listening, conversation, between, doctor, par...","[listening, conversation, doctor, parents, 10-...","[listen, convers, doctor, parent, 10-year-old,...","[listening, conversation, doctor, parent, 10-y..."
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,a brother and sister with a past incestuous re...,"[a, brother, and, sister, with, a, past, inces...","[a, brother, and, sister, with, a, past, inces...","[brother, sister, with, past, incestuous, rela...","[brother, sister, past, incestuous, relationsh...","[brother, sister, past, incestu, relationship,...","[brother, sister, past, incestuous, relationsh..."
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,as the bus empties the students for their fiel...,"[as, the, bus, empties, the, students, for, th...","[as, the, bus, empties, the, students, for, th...","[empties, students, their, field, trip, museum...","[empties, students, field, trip, museum, natur...","[empti, student, field, trip, museum, natur, h...","[empty, student, field, trip, museum, natural,..."
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,to help their unemployed father make ends meet...,"[to, help, their, unemployed, father, make, en...","[to, help, their, unemployed, father, make, en...","[help, their, unemployed, father, make, ends, ...","[help, unemployed, father, make, ends, meet, e...","[help, unemploy, father, make, end, meet, edit...","[help, unemployed, father, make, end, meet, ed..."
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,the film's title refers not only to the un-rec...,"[the, film's, title, refers, not, only, to, th...","[the, film, 's, title, refers, not, only, to, ...","[film, title, refers, only, un-recovered, bodi...","[film, title, refers, un-recovered, bodies, gr...","[film, titl, refer, un-recov, bodi, ground, ze...","[film, title, refers, un-recovered, body, grou..."


## 7. Apply tags for each word with its part-of-speech tag (e.g. NN)

In [13]:
def get_pos_tag(tokenized_sentence):
    return nltk.pos_tag(tokenized_sentence)

train_df['tags'] = train_df['clean_tokens'].apply(lambda x : get_pos_tag(x))
train_df.head()

Unnamed: 0,title,genre,plot,no_punct,tokenized,nltk_tokenized,long_tokens,clean_tokens,stem_words,lemma_words,tags
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening in to a conversation between his doc...,"[listening, in, to, a, conversation, between, ...","[listening, in, to, a, conversation, between, ...","[listening, conversation, between, doctor, par...","[listening, conversation, doctor, parents, 10-...","[listen, convers, doctor, parent, 10-year-old,...","[listening, conversation, doctor, parent, 10-y...","[(listening, VBG), (conversation, NN), (doctor..."
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,a brother and sister with a past incestuous re...,"[a, brother, and, sister, with, a, past, inces...","[a, brother, and, sister, with, a, past, inces...","[brother, sister, with, past, incestuous, rela...","[brother, sister, past, incestuous, relationsh...","[brother, sister, past, incestu, relationship,...","[brother, sister, past, incestuous, relationsh...","[(brother, NN), (sister, NN), (past, IN), (inc..."
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,as the bus empties the students for their fiel...,"[as, the, bus, empties, the, students, for, th...","[as, the, bus, empties, the, students, for, th...","[empties, students, their, field, trip, museum...","[empties, students, field, trip, museum, natur...","[empti, student, field, trip, museum, natur, h...","[empty, student, field, trip, museum, natural,...","[(empties, NNS), (students, NNS), (field, NN),..."
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,to help their unemployed father make ends meet...,"[to, help, their, unemployed, father, make, en...","[to, help, their, unemployed, father, make, en...","[help, their, unemployed, father, make, ends, ...","[help, unemployed, father, make, ends, meet, e...","[help, unemploy, father, make, end, meet, edit...","[help, unemployed, father, make, end, meet, ed...","[(help, NN), (unemployed, VBD), (father, RB), ..."
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,the film's title refers not only to the un-rec...,"[the, film's, title, refers, not, only, to, th...","[the, film, 's, title, refers, not, only, to, ...","[film, title, refers, only, un-recovered, bodi...","[film, title, refers, un-recovered, bodies, gr...","[film, titl, refer, un-recov, bodi, ground, ze...","[film, title, refers, un-recovered, body, grou...","[(film, NN), (title, NN), (refers, NNS), (un-r..."


## 8. Convert words to feature vectors

In [14]:
def return_sentences(tokens):
    return " ".join([word for word in tokens])
train_df['clean_plot'] = train_df['lemma_words'].apply(lambda x : return_sentences(x))
train_df.head()

Unnamed: 0,title,genre,plot,no_punct,tokenized,nltk_tokenized,long_tokens,clean_tokens,stem_words,lemma_words,tags,clean_plot
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening in to a conversation between his doc...,"[listening, in, to, a, conversation, between, ...","[listening, in, to, a, conversation, between, ...","[listening, conversation, between, doctor, par...","[listening, conversation, doctor, parents, 10-...","[listen, convers, doctor, parent, 10-year-old,...","[listening, conversation, doctor, parent, 10-y...","[(listening, VBG), (conversation, NN), (doctor...",listening conversation doctor parent 10-year-o...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...,a brother and sister with a past incestuous re...,"[a, brother, and, sister, with, a, past, inces...","[a, brother, and, sister, with, a, past, inces...","[brother, sister, with, past, incestuous, rela...","[brother, sister, past, incestuous, relationsh...","[brother, sister, past, incestu, relationship,...","[brother, sister, past, incestuous, relationsh...","[(brother, NN), (sister, NN), (past, IN), (inc...",brother sister past incestuous relationship cu...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,as the bus empties the students for their fiel...,"[as, the, bus, empties, the, students, for, th...","[as, the, bus, empties, the, students, for, th...","[empties, students, their, field, trip, museum...","[empties, students, field, trip, museum, natur...","[empti, student, field, trip, museum, natur, h...","[empty, student, field, trip, museum, natural,...","[(empties, NNS), (students, NNS), (field, NN),...",empty student field trip museum natural histor...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,to help their unemployed father make ends meet...,"[to, help, their, unemployed, father, make, en...","[to, help, their, unemployed, father, make, en...","[help, their, unemployed, father, make, ends, ...","[help, unemployed, father, make, ends, meet, e...","[help, unemploy, father, make, end, meet, edit...","[help, unemployed, father, make, end, meet, ed...","[(help, NN), (unemployed, VBD), (father, RB), ...",help unemployed father make end meet edith twi...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,the film's title refers not only to the un-rec...,"[the, film's, title, refers, not, only, to, th...","[the, film, 's, title, refers, not, only, to, ...","[film, title, refers, only, un-recovered, bodi...","[film, title, refers, un-recovered, bodies, gr...","[film, titl, refer, un-recov, bodi, ground, ze...","[film, title, refers, un-recovered, body, grou...","[(film, NN), (title, NN), (refers, NNS), (un-r...",film title refers un-recovered body ground zer...


In [17]:
# train_df[['title', 'genre', 'clean_plot']].to_csv('../data/Movies/train_data_cleaned.csv', index=False)

train_df[['title', 'genre', 'clean_plot']].head()

Unnamed: 0,title,genre,clean_plot
0,Oscar et la dame rose (2009),drama,listening conversation doctor parent 10-year-o...
1,Cupid (1997),thriller,brother sister past incestuous relationship cu...
2,"Young, Wild and Wonderful (1980)",adult,empty student field trip museum natural histor...
3,The Secret Sin (1915),drama,help unemployed father make end meet edith twi...
4,The Unrecovered (2007),drama,film title refers un-recovered body ground zer...


In [18]:
train_df.genre.value_counts()

genre
 drama           13613
 documentary     13096
 comedy           7447
 short            5073
 horror           2204
 thriller         1591
 action           1315
 western          1032
 reality-tv        884
 family            784
 adventure         775
 music             731
 romance           672
 sci-fi            647
 adult             590
 crime             505
 animation         498
 sport             432
 talk-show         391
 fantasy           323
 mystery           319
 musical           277
 biography         265
 history           243
 game-show         194
 news              181
 war               132
Name: count, dtype: int64

# 2. PROFILE-BASED RETRIEVAL IMPLEMENTATION
__2.1 - Documents__

Each document is going to be define as a vector depending on the importance of a word in a document.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer


In [15]:
vectorizer = TfidfVectorizer(norm='l2', use_idf=False)
tfidf_vect = vectorizer.fit_transform(train_df['clean_text']).toarray()
tfidf_vect.shape

NameError: name 'TfidfVectorizer' is not defined

In [None]:
#tf-idf vectors matrix
tokens = []
for i, feature in enumerate(vectorizer.get_feature_names_out()):
    tokens.append(feature)
tfidf_matrix = pd.DataFrame(tfidf_vect, columns = tokens)
tfidf_matrix.iloc[:,7000:]

__2.2. Users__

Once we have every document as a vector with the term frecuency, we are going to create a vector to define 5 different users interested in the categories presented in the data. These categories are: 
1. Thriller
2. Drama
3. Romance
4. Comedy
5. Sci-fi


In [None]:
# Definir Users

## Thriller
user1 =  []
 
## Drama
user2 = []  

## Romance
user3 = []

## Comedy
user4 = []

## Sci-fi
user5 = []

In [None]:
topics_interest = ['thriller', 'drama', 'romance', 'comedy', 'sci-fi', 'thriller and drama', 'romance and comedy', 
          'drama and romance', 'drama and sci-fi', 'comedy and sci-fi']
users_keywords = [user1, user2, user3, user4, user5]

These arrays of keywords have been transformed into vectors. Have also been addeed 5 more users interested in more than one topic. To obtain these vectors the average of the two categories has been made. 

In [None]:
users_vectors =[]
for i in users_keywords:
    users_vectors.append(vectorizer.transform([" ".join(i)]))

#we add the vectors with the combinations of two topics
#user 6 - thriller and drama
users_vectors.append((users_vectors[0]+users_vectors[1])/2)
#user 7 - romance and comedy
users_vectors.append((users_vectors[2]+users_vectors[3])/2)
#user 8 - drama and romance
users_vectors.append((users_vectors[1]+users_vectors[2])/2)
#user 9 - drama and historical
users_vectors.append((users_vectors[1]+users_vectors[4])/2)
#user 10 thriller and historical
users_vectors.append((users_vectors[3]+users_vectors[4])/2)