In [1]:
import os
import json
import pandas as pd

## Read the reviews

The test dataset contains reviews, one in each file, which must be read and saved so that it can be added to a dataframe later. NB: the files in the `./norec` folder were cloned from github using the command `git clone https://github.com/ltgoslo/norec`

In [2]:
def read_reviews(folderpath: str): 
    """
    Read the text from all files in the folder. 
    Args: 
        folderpath: the path to the folder with all the files
    Returns:
        list[str] reviews: containing the entire text, each index corresponds to the text in one file.
        list[str] filenames: containing the filenames of all files read
    """

    reviews = []
    filenames = []

    for filename in os.listdir(folderpath):
        with open(os.path.join(folderpath, filename), 'r', encoding='utf-8') as f:
            try:
                text = f.read()
                reviews.append(text)
                # Keep the filename so that it can be used to find the correct score in metadata.json:
                filenames.append(filename)
            except:
                print(f'Error in file {filename}')
    
    return reviews, filenames

## Get the scores and make binary labels

Retrieve the correct scores from the `metadata.json` file and turn all scores from 1-3 to 0, and all from 4-6 to 1. Also, the categories must be kept so that we can filter on those later. 

In [3]:
def get_scores_and_categories(filenames: list[str]):
    """
    Function to get scores and categories for all reviews read. 
    Args:
        filenames: the filename of each review read, to get the correct values from the metadata file
    Returns:
        list[str] scores: the score of each review 
        list[str] categories: the category of each thing being reviewed
    """

    scores = []
    categories = []

    with open('./norec/data/metadata.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

        for filename in filenames:
            obj = data[filename[0:6]]
            scores.append(obj['rating'])
            categories.append(obj['category'])
    
    return scores, categories

In [4]:
def get_full_dataframe(reviews: list[str], scores: list[str], categories: list[str], filenames: list[str]) -> pd.DataFrame:
    """
    Create a dataframe on the format |text|label|category|filename|
    """
    df = pd.DataFrame({'text': reviews, 'label': scores, 'category': categories, 'filename': filenames})

    # In this experiment only reviews in the category 'screen' are used, because it's the same domain as the IMDB dataset. 
    df = df.loc[df['category'] == 'screen']
    
    # Labels should be either 0 or 1
    df.loc[df['label'] <= 3, 'label'] = 0
    df.loc[df['label'] >= 4, 'label'] = 1
    
    return df    

### Create test dataset

In [5]:
reviews, filenames = read_reviews('./norec/data/test')
scores, categories = get_scores_and_categories(filenames)
df = get_full_dataframe(reviews, scores, categories, filenames)
df.head()

Unnamed: 0,text,label,category,filename
0,Outlander S02 E01 - 02\nFortsatt et eventyr du...,1,screen,000298.txt
1,Fear the Walking Dead S02 E01 - E02\nDårlige r...,0,screen,000299.txt
2,Younger S01 E01 - E08\n40-årskrisa på sitt mes...,0,screen,000303.txt
3,Marseille S01 E01 - E05\nValgkamp med såpesmak...,0,screen,000304.txt
4,Galavant S01 E01 - 04\nFin fantasy-musikal frå...,1,screen,000305.txt


In [6]:
df.value_counts('label')

label
1    1013
0     416
dtype: int64

Create a dataframe consisting of only the text and the label, and save this as a csv file so that it can be easily used by the other files training and evaluating the models. 

In [7]:
norec_df = df[['text', 'label']].copy()
norec_df.to_csv('norec_test.csv', index=False)

In [8]:
norec_df.head()

Unnamed: 0,text,label
0,Outlander S02 E01 - 02\nFortsatt et eventyr du...,1
1,Fear the Walking Dead S02 E01 - E02\nDårlige r...,0
2,Younger S01 E01 - E08\n40-årskrisa på sitt mes...,0
3,Marseille S01 E01 - E05\nValgkamp med såpesmak...,0
4,Galavant S01 E01 - 04\nFin fantasy-musikal frå...,1


In [9]:
norec_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1429 entries, 0 to 4103
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1429 non-null   object
 1   label   1429 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 33.5+ KB


### Create training dataset

To be able to compare a model trained on english and norwegian, a dataset is also needed for training. 

In [10]:
reviews, filenames = read_reviews('./norec/data/train')
scores, categories = get_scores_and_categories(filenames)
df_train = get_full_dataframe(reviews, scores, categories, filenames)
df_train.head()

Unnamed: 0,text,label,category,filename
0,Rome S02\nToppen innen tv-drama akkurat nå! \n...,1,screen,000000.txt
1,Twin Peaks - definitive gold box edition\nGull...,1,screen,000001.txt
2,The Wire (sesong 1-4)\nThe Wire vil gjøre deg ...,1,screen,000002.txt
3,"Mad Men (sesong 1)\nStilig, underholdende og s...",1,screen,000003.txt
4,Mad Men (sesong 2)\nTV-underholdning av høyest...,1,screen,000004.txt


In [11]:
df_train.value_counts('category')

category
screen    11439
dtype: int64

In [12]:
df_train.value_counts('label')

label
1    7474
0    3965
dtype: int64

In [13]:
norec_df_train = df_train[['text', 'label']].copy()
norec_df_train.to_csv('norec_dataset_train.csv', index=False)

In [14]:
# Only 1000 will be used for training
norec_train_small = norec_df_train.sample(n=1000, random_state=3)
norec_train_small.to_csv('norec_train_small.csv', index=False)

In [15]:
norec_train_small.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 3200 to 13723
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1000 non-null   object
 1   label   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 23.4+ KB


### Create evaluation dataset

This dataset will be used to evaluate the model suring training. 

In [16]:
reviews, filenames = read_reviews('./norec/data/dev')
scores, categories = get_scores_and_categories(filenames)
df_eval = get_full_dataframe(reviews, scores, categories, filenames)
df_eval.head()

Unnamed: 0,text,label,category,filename
0,Penny Dreadful S02 E01\nEn god fortsettelse på...,1,screen,000227.txt
1,CSI:Cyber S01 E01-02 \n\nKybernetisk kalkun ut...,0,screen,000228.txt
2,Wayward Pines S01 E01-05\nStarter med et smell...,0,screen,000229.txt
3,Between S01 E01\nLovende tenåringsdrama med nå...,1,screen,000231.txt
4,Sense8 S01 E01-03\nEt ambisiøst sci-fi-eventyr...,0,screen,000232.txt


In [17]:
df_eval.value_counts('category')

category
screen    1429
dtype: int64

In [18]:
df_eval.value_counts('label')

label
1    944
0    485
dtype: int64

In [19]:
norec_df_eval = df_eval[['text', 'label']].copy()
norec_df_eval.to_csv('norec_dataset_eval.csv', index=False)

In [20]:
# Only 200 will be used for evaluation
norec_eval_small = norec_df_train.sample(n=200, random_state=3)
norec_eval_small.to_csv('norec_eval_small.csv', index=False)

In [22]:
norec_eval_small.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 3200 to 5561
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    200 non-null    object
 1   label   200 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.7+ KB
