In [1]:
import data_analysis
import os
from shutil import copy
import pickle
import pandas as pd
import math

In [2]:
# 0_label: suitable for children
# 1_label: not suitable for children 

def get_percentage_of_labeled_data(labeled_0_words_count, labeled_1_words_count):
    """
    Function to get the share of each labeled dataset out of the total corpus.

    Despite the name, the returned values are fractions of the total
    (e.g. 0.25), rounded to three decimal places, not values out of 100.

    :param labeled_0_words_count: count (words or documents) for the 0_label data
    :param labeled_1_words_count: count (words or documents) for the 1_label data
    :return: percentage_labeled_0, percentage_labeled_1;
             (0.0, 0.0) when both counts are zero
    """
    total = labeled_0_words_count + labeled_1_words_count
    if total == 0:
        # An empty corpus has no meaningful split; avoid a ZeroDivisionError.
        return 0.0, 0.0
    return float("%.3f" % (labeled_0_words_count / total)), \
           float("%.3f" % (labeled_1_words_count / total))


def get_average_docs_length(docs_info_lst):
    """
    Function to get the average document length in terms of words.

    :param docs_info_lst: lst of tuples of the form [(words_count, movie_id, rating), ..]
    :return: average_docs_length, truncated to an int; 0 for an empty list
    """
    if len(docs_info_lst) == 0:
        # No documents: report 0 instead of dividing by zero.
        return 0
    # Only the first field of each tuple (the word count) is used.
    total = sum(doc_info[0] for doc_info in docs_info_lst)
    return int(total / len(docs_info_lst))


def get_dict_of_words_totalCount_docsCount(data_dictionary, label):
    """
    Function to count every word of one label, overall and per movie.

    :param data_dictionary: dict of the form {"label_#": {"movie_id": ["w_0", "w_1", ...]}, ..}
    :param label: string, the key of data_dictionary to process (e.g. 0_label or 1_label)
    :return: words_dict: a dict of the form
                 {word_0: [total_occurrences_in_the_labeled_data,
                           {movie_0: occurrences_in_movie_0, ...}],
                  ...}
    """
    counts_by_word = {}
    for movie_id, tokens in data_dictionary[label].items():
        for token in tokens:
            entry = counts_by_word.get(token)
            if entry is None:
                # First time this word is seen anywhere in the label.
                counts_by_word[token] = [1, {movie_id: 1}]
                continue
            # Bump the label-wide total, then the per-movie counter.
            entry[0] += 1
            per_movie = entry[1]
            per_movie[movie_id] = per_movie.get(movie_id, 0) + 1
    return counts_by_word

def out_pickle(pickle_path, pick_name, variable_name):
    """
    Function that pickles out a variable to "<pickle_path><pick_name>.pkl".

    The path parts are concatenated as-is, so pickle_path must already end
    with a path separator (e.g. 'data/').

    :param pickle_path: directory prefix of the pickle file
    :param pick_name: pickle's name 'without .pkl extension'
    :param variable_name: the object to be pickled out
    :return: nothing
    """
    destination = pickle_path + pick_name + ".pkl"
    with open(destination, "wb") as handle:
        pickle.dump(variable_name, handle)

def in_pickle(pickle_path, pick_name):
    """
    Function that pickles in a variable from "<pickle_path><pick_name>.pkl".

    The path parts are concatenated as-is, so pickle_path must already end
    with a path separator (e.g. 'data/').
    Unpickling can execute arbitrary code; only load files you trust.

    :param pickle_path: directory prefix of the pickle file
    :param pick_name: pickle's name 'without .pkl extension'
    :return: the object stored in the pickle file
    """
    source = pickle_path + pick_name + ".pkl"
    with open(source, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded


In [3]:
%%time
# {"label_#: {"movie_id": ["w_0", "w_1", ...]}, .. }
data_dictionary = in_pickle('data/', 'all_labels_dict')

CPU times: user 3.97 s, sys: 976 ms, total: 4.95 s
Wall time: 4.98 s


In [4]:
# Column-oriented accumulator: one list per future DataFrame column.
pandas_dict = {"rating": [], "movie_id": [], "movie_words": []}

In [7]:
%%time
for label in list(data_dictionary.keys())[:2]:
    for movie_name in data_dictionary[label]:
        pandas_dict["rating"].append(label)
        pandas_dict["movie_words"].append(data_dictionary[label][movie_name])
        pandas_dict["movie_id"].append(movie_name)

CPU times: user 2.03 ms, sys: 504 µs, total: 2.53 ms
Wall time: 2.54 ms


In [8]:
# One row per movie (default integer index).
# Columns: rating, movie_id, movie_words ([word_1, word_2, ...]).
column_order = ["rating", "movie_id", "movie_words"]
data_frames = pd.DataFrame(pandas_dict, columns=column_order)

In [9]:
%%time
out_pickle('data/', 'all_labels_dict', data_dictionary)

CPU times: user 8.18 s, sys: 892 ms, total: 9.07 s
Wall time: 10.8 s


In [10]:
# Cache the DataFrame as data/pandas_data_frame.pkl.
out_pickle(pickle_path='data/', pick_name='pandas_data_frame', variable_name=data_frames)

In [12]:
# Reload the cached DataFrame from data/pandas_data_frame.pkl.
data_frames = in_pickle(pickle_path='data/', pick_name='pandas_data_frame')

In [13]:
# Disabled code kept as a bare string literal: nothing below is executed.
# It would load a pickled word2vec model from an absolute, machine-specific path.
'''
pickle_in = open('C:/Users/Abdo/Project/pickle_files/word2vec_model_trained_150.pkl','rb')
model = pickle.load(pickle_in)
'''

In [14]:
# wordCount_movieId_rating_lst: list of tuples [(words_count, movie_id, rating), ...]
# covering every row of data_frames ("both labels"), sorted ascending.
wordCount_movieId_rating_lst = []

for index, row in data_frames.iterrows():
    movie_id = row["movie_id"]
    rating = row["rating"]
    length = len(row["movie_words"])
    wordCount_movieId_rating_lst.append((length, movie_id, rating))

# Sort once after the list is complete; sorting inside the loop re-sorted the
# whole list for every row and produced the same final order.
wordCount_movieId_rating_lst.sort()

In [15]:
# Number of 0 & 1 labeled docs
number_of_0_label_docs = len(data_dictionary["0_label"])
number_of_1_label_docs = len(data_dictionary["1_label"])

# Share of 0, 1 labeled docs among the whole corpus
# (document counts are passed here, not word counts)
percentage_0_labeled_docs, percentage_1_labeled_docs = \
    get_percentage_of_labeled_data(number_of_0_label_docs, number_of_1_label_docs)

# Shortest and longest documents. This relies on wordCount_movieId_rating_lst
# being sorted ascending; each value is the full
# (words_count, movie_id, rating) tuple, not a bare number.
min_number_of_words = wordCount_movieId_rating_lst[0]
max_number_of_words = wordCount_movieId_rating_lst[-1]

# average docs length in terms of words
average_docs_length = get_average_docs_length(wordCount_movieId_rating_lst)

# Optional full listing, kept disabled:
# print(pd.DataFrame(data=wordCount_movieId_rating_lst,
#                    columns=["wordCount", "movieId", "rating"]))

# The summary must be the cell's last expression to be displayed; a trailing
# string literal of disabled code used to be echoed instead.
(number_of_0_label_docs,
 number_of_1_label_docs,
 percentage_0_labeled_docs,
 percentage_1_labeled_docs,
 average_docs_length)

In [16]:
# Disabled code kept as a bare string literal: nothing below is executed,
# the notebook only echoes the string as this cell's output.
'''
# from the word2vec model
vocabulary = set(model.wv.vocab)
'''

'\n# from the word2vec model\nvocabulary = set(model.wv.vocab)\n'

In [17]:
# Per-label word statistics: {word: [total_count, {movie_id: count_in_movie}]}
dict_of_1_label_words, dict_of_0_label_words = (
    get_dict_of_words_totalCount_docsCount(data_dictionary, label_key)
    for label_key in ("1_label", "0_label")
)

In [18]:
# Distinct words (vocabulary) seen under each label.
label_0_keys = list(dict_of_0_label_words)
label_1_keys = list(dict_of_1_label_words)

In [19]:
# Vocabulary size per label: (0_label, 1_label)
len(label_0_keys), len(label_1_keys)

(76587, 148973)

In [26]:
# Cache both per-label word dictionaries under data/.
out_pickle(pickle_path='data/', pick_name='dict_of_1_label_words',
           variable_name=dict_of_1_label_words)
out_pickle(pickle_path='data/', pick_name='dict_of_0_label_words',
           variable_name=dict_of_0_label_words)

In [30]:
%%time
dict_of_1_label_words = in_pickle('data/', 'dict_of_1_label_words')
dict_of_0_label_words = in_pickle('data/', 'dict_of_0_label_words')

CPU times: user 2.52 s, sys: 43.6 ms, total: 2.56 s
Wall time: 2.58 s


In [13]:
# Words treated as inappropriate; filled from a text file in a later cell.
inappropriate_words = []
# key   : inappropriate_word
# value : [total_occurrences, {movie_id: occurrences_in_movie, movie_id: occurrences_in_movie, ...}]
inappropriate_words_in_0_label_movies = {}

In [14]:
# Read the word list, one word per line, stripping the newline characters.
# NOTE(review): absolute, machine-specific path; re-running this cell appends
# the same words to inappropriate_words again.
# The with-block closes the file handle, which was previously left open.
with open("C:/Users/Abdo/Project/Notes/inappropriate_words.txt","r") as file:
    for word in file.readlines():
        inappropriate_words.append(word.replace("\n",""))

In [15]:
# Fill the values: for every inappropriate word, take its statistics in the
# 0_label movies, i.e. [total_occurrences, {movie_id: occurrences_in_movie}].
for inappropriate_word in inappropriate_words:
    if inappropriate_word in dict_of_0_label_words:
        # Shares (does not copy) the entry of dict_of_0_label_words.
        inappropriate_words_in_0_label_movies[inappropriate_word] = dict_of_0_label_words[inappropriate_word]
    else:
        # Word never occurs: zero total and an empty per-movie dict. This used
        # to be a list, unlike every other entry's {movie_id: count} mapping.
        inappropriate_words_in_0_label_movies[inappropriate_word] = [0, {}]

In [17]:
#with open('C:/Users/Abdo/Project/pickle_files/inappropriate_words_in_0_label_movies.pkl', 'wb') as output:
    #pickle.dump(inappropriate_words_in_0_label_movies, output)

In [None]:
#pickle_in = open('C:/Users/Abdo/Project/pickle_files/inappropriate_words_in_0_label_movies.pkl','rb')
#inappropriate_words_in_0_label_movies = pickle.load(pickle_in)

In [20]:
#total_words_in_all_movies
# Union of both labels' vocabularies.
all_text_words = set(dict_of_1_label_words) | set(dict_of_0_label_words)

In [21]:
# Words that occur under both labels.
common_words = set(dict_of_1_label_words) & set(dict_of_0_label_words)

In [22]:
# Share of the combined vocabulary that occurs under both labels.
# Despite the name this is a fraction (0..1), not a value out of 100.
# Raises ZeroDivisionError if all_text_words is empty.
percentage_of_common_words = len(common_words)/len(all_text_words)

In [23]:
# Words that occur under exactly one of the two labels.
unique_words_in_0_label = set(dict_of_0_label_words) - common_words
unique_words_in_1_label = set(dict_of_1_label_words) - common_words

In [64]:
# key   : word that occurs under only one label
# value : [total_frequency, {movie_name: frequency_of_word_in_movie, ...}]
frequency_of_unique_words_in_0_label = {}
frequency_of_unique_words_in_1_label = {}

In [65]:
# Count, overall and per movie, the words that occur only in 0_label movies.
# Counts accumulate into the existing dict, so re-running the cell doubles them.
for movie_name, words_of_movie in data_dictionary["0_label"].items():
    for word in words_of_movie:
        if word not in unique_words_in_0_label:
            continue
        entry = frequency_of_unique_words_in_0_label.get(word)
        if entry is None:
            frequency_of_unique_words_in_0_label[word] = [1, {movie_name: 1}]
        else:
            entry[0] += 1
            entry[1][movie_name] = entry[1].get(movie_name, 0) + 1
        

In [72]:
# Count, overall and per movie, the words that occur only in 1_label movies.
# Counts accumulate into the existing dict, so re-running the cell doubles them.
for movie_name, words_of_movie in data_dictionary["1_label"].items():
    for word in words_of_movie:
        if word not in unique_words_in_1_label:
            continue
        entry = frequency_of_unique_words_in_1_label.get(word)
        if entry is None:
            frequency_of_unique_words_in_1_label[word] = [1, {movie_name: 1}]
        else:
            entry[0] += 1
            entry[1][movie_name] = entry[1].get(movie_name, 0) + 1

In [73]:
#with open('C:/Users/Abdo/Project/pickle_files/frequency_of_unique_words_in_1_label.pkl', 'wb') as output:
    #pickle.dump(frequency_of_unique_words_in_1_label, output)

In [74]:
#with open('C:/Users/Abdo/Project/pickle_files/frequency_of_unique_words_in_0_label.pkl', 'wb') as output:
    #pickle.dump(frequency_of_unique_words_in_0_label, output)

In [None]:
#pickle_in = open('C:/Users/Abdo/Project/pickle_files/frequency_of_unique_words_in_0_label.pkl','rb')
#frequency_of_unique_words_in_0_label = pickle.load(pickle_in)

In [None]:
#pickle_in = open('C:/Users/Abdo/Project/pickle_files/frequency_of_unique_words_in_1_label.pkl','rb')
#frequency_of_unique_words_in_1_label = pickle.load(pickle_in)

In [75]:
# Lists of (frequency, word) tuples, one list per label: [(frequency, word), ...]
# The frequency comes first so that a plain sort orders the words by frequency.
list_of_uinque_0_label_words_and_frequencies = []
list_of_uinque_1_label_words_and_frequencies = []

In [76]:
# Flatten {word: [total, per_movie]} into (total, word) tuples for sorting.
for word, counts in frequency_of_unique_words_in_0_label.items():
    list_of_uinque_0_label_words_and_frequencies.append((counts[0], word))

In [77]:
# Flatten {word: [total, per_movie]} into (total, word) tuples for sorting.
for word, counts in frequency_of_unique_words_in_1_label.items():
    list_of_uinque_1_label_words_and_frequencies.append((counts[0], word))

In [78]:
# Sort both lists in place, ascending by (frequency, word), so the most
# frequent words end up at the end of each list.
list_of_uinque_0_label_words_and_frequencies.sort()
list_of_uinque_1_label_words_and_frequencies.sort()

In [81]:
#with open('C:/Users/Abdo/Project/pickle_files/list_of_uinque_0_label_words_and_frequencies.pkl', 'wb') as output:
    #pickle.dump(list_of_uinque_0_label_words_and_frequencies, output)

In [82]:
#with open('C:/Users/Abdo/Project/pickle_files/list_of_uinque_1_label_words_and_frequencies.pkl', 'wb') as output:
    #pickle.dump(list_of_uinque_1_label_words_and_frequencies, output)

In [24]:
# Load the cached (frequency, word) list for the 1_label movies.
# NOTE(review): this overwrites the list built by the cells above with the
# contents of an absolute, machine-specific pickle file.
# The with-block closes the file handle, which was previously left open.
with open('C:/Users/Abdo/Project/pickle_files/list_of_uinque_1_label_words_and_frequencies.pkl','rb') as pickle_in:
    list_of_uinque_1_label_words_and_frequencies = pickle.load(pickle_in)

In [25]:
# Load the cached (frequency, word) list for the 0_label movies.
# NOTE(review): this overwrites the list built by the cells above with the
# contents of an absolute, machine-specific pickle file.
# The with-block closes the file handle, which was previously left open.
with open('C:/Users/Abdo/Project/pickle_files/list_of_uinque_0_label_words_and_frequencies.pkl','rb') as pickle_in:
    list_of_uinque_0_label_words_and_frequencies = pickle.load(pickle_in)

In [87]:
# Share of bad words in every 0_label movie.
# key   : movie_id
# value : number_of_bad_words / number_of_words_in_movie (a fraction, not out of 100)
percentage_of_bad_words_in_0_labeld_movies = {}

In [89]:
# For every 0_label movie, sum the occurrences of all inappropriate words in
# that movie and divide by the movie's total word count.
for movie_name in data_dictionary["0_label"]:
    movie_words_count = len(data_dictionary["0_label"][movie_name])
    number_of_bad_words = 0
    for bad_word in inappropriate_words_in_0_label_movies:
        # Element [1] is the per-movie counter of this bad word.
        occurrences_per_movie = inappropriate_words_in_0_label_movies[bad_word][1]
        if movie_name in occurrences_per_movie:
            number_of_bad_words += occurrences_per_movie[movie_name]
    if movie_words_count == 0:
        # A movie without words contains no bad words; avoid dividing by zero.
        percentage_of_bad_words_in_0_labeld_movies[movie_name] = 0.0
    else:
        percentage_of_bad_words_in_0_labeld_movies[movie_name] = number_of_bad_words/movie_words_count

In [99]:
#with open('C:/Users/Abdo/Project/pickle_files/percentage_of_bad_words_in_0_labeld_movies.pkl', 'wb') as output:
    #pickle.dump(percentage_of_bad_words_in_0_labeld_movies, output)

In [26]:
# Load the cached {movie_id: share_of_bad_words} dict.
# NOTE(review): this overwrites the dict computed by the cells above with the
# contents of an absolute, machine-specific pickle file.
# The with-block closes the file handle, which was previously left open.
with open('C:/Users/Abdo/Project/pickle_files/percentage_of_bad_words_in_0_labeld_movies.pkl','rb') as pickle_in:
    percentage_of_bad_words_in_0_labeld_movies = pickle.load(pickle_in)

In [27]:
# Turn the dictionary into a list so it can be sorted:
# [(share_of_bad_words_in_movie, movie_name), ...]
percentage_of_bad_words_in_0_labeld_movies_list = []
for movie_name, bad_words_share in percentage_of_bad_words_in_0_labeld_movies.items():
    percentage_of_bad_words_in_0_labeld_movies_list.append((bad_words_share, movie_name))

In [28]:
# Sort in place, ascending by (share_of_bad_words, movie_name).
percentage_of_bad_words_in_0_labeld_movies_list.sort()

In [29]:
# Last element of the ascending list: the 0_label movie with the largest share of bad words.
print(percentage_of_bad_words_in_0_labeld_movies_list[-1])

(0.019112874143526866, 'PG_4918')
