<a href="https://colab.research.google.com/github/GuFukuRo/movie-screenplays/blob/main/DataLabels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; every path in `config` points below
# this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pickle
import pandas as pd
import os
from itertools import chain

In [None]:
# Notebook-wide configuration: input locations on the mounted Drive and the
# labelling tasks that labels are built for.
config = {
    'paths': {
        # CSV read by DataLabels for the meta score / year / genre labels.
        'movie_meta_data': '/content/drive/MyDrive/Movie scripts dataset/Movie meta data/movie_meta_data.csv',
        # Not read by any cell visible in this notebook.
        'movies_matching_scores': '/content/drive/MyDrive/Movie scripts dataset/Movie characters/Matching evaluation and statistics/movies_mean_matching_scores.xlsx',
        # Pickled collection of file names of the award-nominated movies.
        'nominated_movies': '/content/drive/MyDrive/Movie scripts dataset/BERT training data/Screenplay awards data/all_awards_movies.pickle',
        # Directory holding one script file per movie.
        'scripts': '/content/drive/MyDrive/Movie scripts dataset/Movie scripts and annotations/Scripts',
    },
    'task': {
        # Every task a label dictionary is generated for.
        'all': ('script_awards', 'meta_scores', 'year', 'genre'),
        # Selected task, from ['meta_scores', 'script_awards'].
        'name': 'script_awards',
    },
}

In [None]:
# Load the movie meta data CSV and show which columns it provides.
movie_meta_df = pd.read_csv(config['paths']['movie_meta_data'])
print(movie_meta_df.columns)

In [None]:
# Split every comma-separated genre string into a list of genre names.
# NaN compares unequal to itself, so `genre_str == genre_str` is False for
# missing values, which are replaced by [''].
movie_meta_df.genres = movie_meta_df.genres.apply(
    lambda genre_str: genre_str.split(', ') if genre_str == genre_str else ['']
)
# Flatten the per-movie genre lists into one list and print it.
print(list(chain.from_iterable(movie_meta_df.genres.values)))

In [None]:
# Distinct genre names across all movies (includes '' for movies without genres).
all_genres = set(chain.from_iterable(movie_meta_df.genres.values))

In [None]:
# Show how many distinct genres were found, then the genres themselves.
print(len(all_genres))
print(all_genres)

In [None]:
# Show the distinct values of the `year` column, in order of first appearance.
print(movie_meta_df.year.drop_duplicates().values)

In [None]:
class DataLabels:
    """Build per-movie label dictionaries for the supported labelling tasks.

    All inputs are located through ``config['paths']``:

    * ``movie_meta_data`` -- CSV file with (at least) the columns ``imdbid``,
      ``metascore``, ``year`` and ``genres``.
    * ``nominated_movies`` -- pickle file holding an iterable of file names of
      award-nominated movies.
    * ``scripts`` -- directory containing one script file per movie.

    Files are read on every call; nothing is cached on the instance.
    """

    def __init__(self, config):
        """Store *config*; no file is touched until a getter is called."""
        self.config = config

    def _read_movie_meta(self):
        """Return the movie meta data CSV as a freshly loaded DataFrame."""
        return pd.read_csv(self.config['paths']['movie_meta_data'])

    def _meta_column_by_imdbid(self, column):
        """Return ``{imdbid: value}`` for *column* of the meta data CSV."""
        meta_df = self._read_movie_meta()
        return dict(zip(meta_df['imdbid'].tolist(), meta_df[column].tolist()))

    def get_movie_meta_scores(self):
        """Return a dict mapping each ``imdbid`` to its ``metascore``."""
        return self._meta_column_by_imdbid('metascore')

    def get_movie_years(self):
        """Return a dict mapping each ``imdbid`` to its ``year``."""
        return self._meta_column_by_imdbid('year')

    def get_movie_genre(self):
        """Return a dict mapping each ``imdbid`` to its first listed genre.

        ``genres`` is expected to be a ``', '``-separated string; only the
        first entry is kept. Movies without a genre string (missing value in
        the CSV) are mapped to ``-1``.
        """
        meta_df = self._read_movie_meta()
        first_genres = meta_df['genres'].apply(
            lambda genres: genres.split(', ')[0] if isinstance(genres, str) else -1
        )
        return dict(zip(meta_df['imdbid'].tolist(), first_genres.tolist()))

    def get_movie_script_awards(self):
        """Split the available scripts into nominated and not nominated.

        Returns:
            A tuple ``(nominated_ids, not_nominated_ids)`` of lists of id
            strings. ``nominated_ids`` comes from the pickled file names (in
            their stored order); ``not_nominated_ids`` are the ids of the files
            in the scripts directory that are not in ``nominated_ids``.
        """
        # Use the instance's own config here (not a module-level `config`), so
        # the class also works outside the notebook and with other configs.
        # NOTE: pickle.load can execute arbitrary code -- only point
        # `nominated_movies` at a trusted file.
        with open(self.config['paths']['nominated_movies'], 'rb') as f:
            nominated_movies = pickle.load(f)

        # The id is the second '_'-separated field, up to the first '.'.
        # NOTE(review): script files below use the *last* '_' field instead;
        # the two only agree for names with a single '_' -- confirm the naming.
        nominated_movies_imdb_ids = [
            name.split('_')[1].split('.')[0] for name in nominated_movies
        ]

        all_movie_scripts = os.listdir(self.config['paths']['scripts'])
        imdb_ids = [
            name.replace('.txt', '').split('_')[-1] for name in all_movie_scripts
        ]
        print('len imdb_ids', len(imdb_ids))

        # Set lookup keeps the filtering linear in the number of scripts.
        nominated_lookup = set(nominated_movies_imdb_ids)
        not_nominated_movies_imdb_ids = [
            imdb_id for imdb_id in imdb_ids if imdb_id not in nominated_lookup
        ]
        return nominated_movies_imdb_ids, not_nominated_movies_imdb_ids

    def get_labels(self, task):
        """Return the ``{movie id: label}`` dictionary for *task*.

        Args:
            task: One of ``'meta_scores'``, ``'script_awards'``, ``'year'`` or
                ``'genre'``.

        Returns:
            For ``'script_awards'`` a dict with id strings parsed from file
            names as keys and ``1`` (nominated) / ``0`` (not nominated) as
            values; for the other tasks a dict keyed by the CSV ``imdbid``
            values. ``None`` is returned for any other task name.
        """
        if task == 'meta_scores':
            return self.get_movie_meta_scores()
        if task == 'script_awards':
            nominated_ids, not_nominated_ids = self.get_movie_script_awards()
            # Nominated ids first, then the rest; the two groups are disjoint.
            scripts_award_dict = dict.fromkeys(nominated_ids, 1)
            scripts_award_dict.update(dict.fromkeys(not_nominated_ids, 0))
            return scripts_award_dict
        if task == 'year':
            return self.get_movie_years()
        if task == 'genre':
            return self.get_movie_genre()
        # Unknown task names yield None, as callers may rely on this.
        return None


In [None]:
# Label builder bound to the notebook configuration.
DL = DataLabels(config)

In [None]:
# Build one label dictionary per configured task, keyed by task name.
task_to_labels_dicst = {}
for task in config['task']['all']:
    labels_dict = DL.get_labels(task)
    task_to_labels_dicst[task] = labels_dict

# Dump the full mapping for a visual check.
print(task_to_labels_dicst)

In [None]:
# Inspect the 'year' labels: number of entries, then the mapping itself.
print(len(task_to_labels_dicst['year']))
print(task_to_labels_dicst['year'])

In [None]:
# Persist the task -> labels mapping as a pickle file on the mounted Drive.
with open('/content/drive/MyDrive/Movie scripts dataset/BERT training data/Script texts/script_task_to_labels_dicts.pickle', 'wb') as f:
    pickle.dump(task_to_labels_dicst, f)