In [15]:
import pandas as pd
import os

In [8]:
yelp_df = pd.read_csv('train.csv', names=['label', 'review'])
imdb_df = pd.read_csv('imdb.csv')
sst_df = pd.concat([pd.read_csv('SST-2/train.tsv', sep='\t'),
                   pd.read_csv('SST-2/dev.tsv', sep='\t')])

imdb_df.columns = ['review', 'label']
sst_df.columns = ['review', 'label']

In [9]:
# yelp
yelp_df.drop_duplicates(inplace=True)
yelp_df.reset_index(inplace=True, drop=True)
yelp_df.label -= 1
# take only 80000 data randomly
yelp_df = yelp_df.sample(n=80000, random_state=42)

# imdb
imdb_df.drop_duplicates(inplace=True)
imdb_df.reset_index(inplace=True, drop=True)
imdb_df.label.replace({'positive': 1, 'negative': 0}, inplace=True)
label = imdb_df.pop('label')
imdb_df.insert(0, 'label', label)

# sst
sst_df.drop_duplicates(inplace=True)
sst_df.reset_index(inplace=True, drop=True)
label = sst_df.pop('label')
sst_df.insert(0, 'label', label)

In [10]:
len(yelp_df), len(imdb_df), len(sst_df)

(80000, 49582, 67855)

In [11]:
df = pd.concat([yelp_df, imdb_df, sst_df])

In [12]:
df

Unnamed: 0,label,review
34566,1,This place is one of my favorite comic shops. ...
223092,0,The wait time for an appointment is ridiculous...
110270,0,I did not like this hotel at all. It's very ol...
365013,1,Mill Avenue has a serious issue with parking. ...
311625,1,Favorite sushi place in NV! Price is reasonab...
...,...,...
67850,0,has all the depth of a wading pool .
67851,1,a movie with a real anarchic flair .
67852,0,a subject like this should inspire reaction in...
67853,0,... is an arthritic attempt at directing by ca...


In [16]:
def generate_df(df, dataset):
    p_1 = int(len(df) * 0.01)
    p_0_1 = p_1 // 10
    p_0_5 = p_0_1 * 5

    if not os.path.exists('data'):
        os.makedirs('data')

    # 1%
    df.iloc[:p_1, :].to_csv(f'data/{dataset}_train_1_percent.csv',
                            header=False, index=False)
    df.iloc[p_1:, :].to_csv(f'data/{dataset}_1_percent.csv', header=False, index=False)

    # 0.5%
    df.iloc[:p_0_5, :].to_csv(
        f'data/{dataset}_train_0_5_percent.csv', header=False, index=False)
    df.iloc[p_0_5:, :].to_csv(f'data/{dataset}_0_5_percent.csv',
                            header=False, index=False)

    # 0.1%
    df.iloc[:p_0_1, :].to_csv(
        f'data/{dataset}_train_0_1_percent.csv', header=False, index=False)
    df.iloc[p_0_1:, :].to_csv(f'data/{dataset}_0_1_percent.csv',
                            header=False, index=False)



In [17]:
generate_df(df, 'data')
generate_df(yelp_df, 'yelp')
generate_df(imdb_df, 'imdb')
generate_df(sst_df, 'sst')