## Converting the .txt files into one CSV file
The dataset contains 50K instances in total: 25K for training and 25K for testing. Because the test set is as large as the training set, I merge both into a single CSV file here, so that later on we can split the data into training and testing sets of any size we want.

In [2]:
import pandas as pd
import os

# Root folder of the dataset, relative to the notebook's working directory.
# The default train/test directories used by the functions below are built
# from it as DATADIR + "/train" and DATADIR + "/test".
DATADIR = "./imdb_ckb"

In [3]:
def read_text(filename, encoding="utf-8"):
    """Return the entire contents of a text file as a single string.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.
    encoding : str, default "utf-8"
        Encoding used to decode the file. It is passed explicitly so the
        result does not depend on the platform's locale encoding, which is
        not UTF-8 on every system and can garble or reject non-ASCII text.
        NOTE(review): the review files are presumed to be UTF-8 -- confirm.

    Returns
    -------
    str
        The decoded file contents.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid in ``encoding``.
    """
    with open(filename, encoding=encoding) as f:
        return f.read()

def get_imdb_df(datadir, val=None):
    """Load every file of a directory into a labelled DataFrame.

    Parameters
    ----------
    datadir : str
        Directory whose files are read, one review per file. A trailing
        path separator is accepted but not required.
    val : any, default None
        Label stored in the ``sentiment`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``review`` (file contents) and ``sentiment`` (``val``),
        one row per regular file, ordered by file name. The frame is empty
        (but keeps both columns) when the directory holds no files.

    Raises
    ------
    OSError
        If ``datadir`` cannot be listed or a file cannot be read.
    """
    rows = []
    # os.listdir gives entries in arbitrary order; sorting keeps the row
    # order (and therefore any CSV written from it) reproducible.
    for filename in sorted(os.listdir(datadir)):
        # os.path.join works with or without a trailing slash on datadir.
        path = os.path.join(datadir, filename)
        # Skip sub-directories and other non-regular entries: they are not
        # reviews and cannot be opened as text files.
        if not os.path.isfile(path):
            continue
        rows.append((read_text(path), val))
    return pd.DataFrame(rows, columns=['review', 'sentiment'])

In [4]:
def merge_files_into_csv(train_dir=DATADIR + "/train", test_dir=DATADIR + "/test",
                         output_path="IMDB_Sorani_Reviews.csv"):
    """Merge the train and test review folders into one CSV file.

    For each of ``train_dir`` and ``test_dir`` the ``pos`` sub-folder is
    loaded with label 1 and the ``neg`` sub-folder with label 0. The size of
    each split is printed, then all rows (train first, then test; positives
    before negatives within a split) are written to ``output_path``.

    Parameters
    ----------
    train_dir : str
        Directory containing the ``pos`` and ``neg`` training folders.
    test_dir : str
        Directory containing the ``pos`` and ``neg`` testing folders.
    output_path : str, default "IMDB_Sorani_Reviews.csv"
        Destination of the merged CSV. An existing file is overwritten.

    Returns
    -------
    None
    """
    splits = []
    for split_name, split_dir in (("train", train_dir), ("test", test_dir)):
        # The trailing slash is kept so the path also works with a loader
        # that joins directory and file name by plain concatenation.
        pos = get_imdb_df(split_dir + "/pos/", 1)
        neg = get_imdb_df(split_dir + "/neg/", 0)
        split = pd.concat([pos, neg])
        print("IMDB %s data size: %d " % (split_name, len(split)))
        splits.append(split)

    df = pd.concat(splits)
    # The row index carries no information here, so it is not written.
    df.to_csv(output_path, index=False, header=True)

In [15]:
# Build the merged CSV from the default train/test directories under DATADIR.
merge_files_into_csv()

IMDB train data size: 25000 
IMDB test data size: 25000 


In [5]:
def train_test_split(train_dir=DATADIR + "/train", test_dir=DATADIR + "/test",
                     train_path="train.csv", test_path="test.csv"):
    """Write the train and test review folders to two separate CSV files.

    For each of ``train_dir`` and ``test_dir`` the ``pos`` sub-folder is
    loaded with label 1 and the ``neg`` sub-folder with label 0 (positives
    first). The size of the split is printed and the split is written to its
    own CSV. The training file is written before the test folder is read.

    NOTE: this function shares its name with scikit-learn's
    ``sklearn.model_selection.train_test_split`` but does something
    different: it performs no random splitting, it only exports the split
    already present on disk. Importing the scikit-learn function into the
    same namespace would shadow one of the two.

    Parameters
    ----------
    train_dir : str
        Directory containing the ``pos`` and ``neg`` training folders.
    test_dir : str
        Directory containing the ``pos`` and ``neg`` testing folders.
    train_path : str, default "train.csv"
        Destination of the training CSV. An existing file is overwritten.
    test_path : str, default "test.csv"
        Destination of the testing CSV. An existing file is overwritten.

    Returns
    -------
    None
    """
    jobs = (
        ("train", train_dir, train_path),
        ("test", test_dir, test_path),
    )
    for split_name, split_dir, out_path in jobs:
        # The trailing slash is kept so the path also works with a loader
        # that joins directory and file name by plain concatenation.
        pos = get_imdb_df(split_dir + "/pos/", 1)
        neg = get_imdb_df(split_dir + "/neg/", 0)
        split = pd.concat([pos, neg])
        print("IMDB %s data size: %d " % (split_name, len(split)))
        # The row index carries no information here, so it is not written.
        split.to_csv(out_path, index=False, header=True)


In [6]:
# Export the on-disk train/test split to train.csv and test.csv using the default directories.
train_test_split()

IMDB train data size: 24903 
IMDB test data size: 24692 
