# Data Sampling
**`Goal:`** Draw a sample of the merged dataframe to label for a supervised learning task

### 1. Import Libraries

In [2]:
import pandas as pd
import numpy as np

### 2. Defining sampler

In [3]:
def sample_generator(filepath, pct_of_file=0.25, seed=None):
    """
    Return a random sample of the rows of a CSV file.

    Every data row is kept independently with probability ``pct_of_file``, so the
    number of sampled rows is only approximately ``pct_of_file * total_rows``.

    Inputs:
        - filepath (str): Path to the CSV file to be read
        - pct_of_file (float): Fraction of the original file to sample, between 0 and 1
        - seed (int, optional): Seed for a reproducible draw. When None (the default)
          the global NumPy random state is used.

    Output:
        - df (pandas Dataframe): The random sample, re-indexed from 0

    Raises:
        - ValueError: If pct_of_file is not within [0, 1]
    """
    # Catch e.g. 25 passed to mean "25%" instead of silently returning the whole file
    if not 0 <= pct_of_file <= 1:
        raise ValueError(
            f"pct_of_file must be a fraction between 0 and 1, got {pct_of_file!r}"
        )

    # Parse the file once: the total row count is reported below, so the whole
    # file has to be read anyway and skipping rows while parsing saves nothing.
    full_df = pd.read_csv(filepath, low_memory=False)

    # With no seed, fall back to the global NumPy state so np.random.seed(...) still applies
    rng = np.random if seed is None else np.random.default_rng(seed)

    # One uniform draw per row; a row is kept when its draw falls below pct_of_file
    keep_mask = rng.random(len(full_df)) < pct_of_file
    df = full_df[keep_mask].reset_index(drop=True)

    print(f"Sampling complete. Sampled {len(df)} rows from {len(full_df)} rows")

    return df

### 3. Generate Sample From the Dataframe

In [35]:
# General-purpose draw using the sampler's default fraction (pct_of_file=0.25)
sampled_df = sample_generator(filepath='../data/raw/merged.csv')

Sampling complete. Sampled 434 rows from 1646 rows


#### a. Sample for language labelling

In [8]:
# Small (~5%) random draw for annotation, used to estimate the proportion of
# Pidgin English texts in the data
language_labelling = sample_generator(
    filepath='../data/raw/merged.csv',
    pct_of_file=0.05,
)

Sampling complete. Sampled 82 rows from 1646 rows


#### b. Sample for further aspect-based sentiment analysis (ABSA) annotation

In [7]:
# Note: a large share (~50%) of the data is drawn on purpose. Some of these tweets
# overlap with ones that are already annotated, so the oversized draw leaves enough
# new ones to label. Most likely not all of the sample will be used.
sample_for_absa_annotation = sample_generator(
    filepath='../data/raw/merged.csv',
    pct_of_file=0.5,
)

Sampling complete. Sampled 826 rows from 1646 rows


In [6]:
# Inspect the ABSA annotation sample (shown as this cell's output)
sample_for_absa_annotation

Unnamed: 0,ISP_Name,Time,Text,Coordinates,Place,Source
0,sprectranet,2020-04-21 06:11:55+00:00,@Spectranet_NG is this even fair? I won’t rene...,,Place(_api=<tweepy.api.API object at 0x7fbc03d...,Twitter for iPhone
1,sprectranet,2019-02-16 18:11:48,@Spectranet_NG Can I subscribe via @UBAGroup m...,,Place(_api=<tweepy.api.API object at 0x7f96f2f...,Twitter for Android
2,sprectranet,2020-10-20 08:36:06+00:00,If Spectranet wants to be shit like this eh th...,,Place(_api=<tweepy.api.API object at 0x7fbc029...,Twitter for iPhone
3,tizeti,2020-07-08 05:37:08+00:00,@igalaman @tizeti No one. And they will still ...,,Place(_api=<tweepy.api.API object at 0x7fe2d86...,Twitter for Android
4,tizeti,2019-07-28 22:13:53+00:00,TIZETI PLEASE,,Place(_api=<tweepy.api.API object at 0x7fe2d6d...,Twitter for iPhone
...,...,...,...,...,...,...
422,sprectranet,2020-04-28 11:51:58+00:00,Spectranet/MTNN is making WFH today super stre...,,Place(_api=<tweepy.api.API object at 0x7fbc03d...,Twitter for Android
423,sprectranet,2020-04-03 12:55:27+00:00,"50GB gone in one week, @Spectranet_NG na so? I...",,Place(_api=<tweepy.api.API object at 0x7fbc03d...,Twitter for Android
424,sprectranet,2019-03-02 15:08:26,I can’t find my @Spectranet_NG mifi and I sti...,,Place(_api=<tweepy.api.API object at 0x7f96f2f...,Twitter for iPhone
425,sprectranet,2020-10-30 00:19:29+00:00,Spectranet is always terrible at night. Fix up...,,Place(_api=<tweepy.api.API object at 0x7fbc029...,Twitter for iPhone


### 4. Save Sample Dataframe to CSV File
This CSV will be subsequently annotated

In [10]:
def save_file_with_name(df, name):
    """
    Write a dataframe to ../data/raw/<name>.csv without the index column.

    Inputs:
        - df (pandas Dataframe): Data to write
        - name (str): File name without the .csv extension
    """
    return df.to_csv(f"../data/raw/{name}.csv", index=False)


# Full sample with every column
save_file_with_name(sampled_df, 'sample')

# Text-only copy of the sample, with the column header lower-cased to 'text'
save_file_with_name(sampled_df[['Text']].rename(columns={'Text': 'text'}),
                    'sample_tweet_text')

#Save the dataset for language annotation
save_file_with_name(language_labelling[['Text']].rename(columns={'Text': 'text'}),
                    'lang_labelling_sample')

#Save the dataset for further absa annotation
# NOTE(review): unlike the two text-only files above, this one keeps the original
# 'Text' header - confirm that is intended
save_file_with_name(sample_for_absa_annotation[['Text']], 'sample_for_absa_annotation')