# Data Sampling
**`Goal:`** Draw a sample of the merged dataframe to label for a supervised learning task

### 1. Library Imports

In [24]:
import pandas as pd
import numpy as np

### 2. Defining sampler

In [25]:
def sample_generator(filepath, pct_of_file=0.25, seed=None):
    """
    Return a random sample of the rows of a CSV file.

    Each data row is kept independently with probability ``pct_of_file``, so
    the sample size is only approximately ``pct_of_file`` times the number of
    rows in the file.

    Inputs:
        - filepath (str): Path to the CSV file to be read
        - pct_of_file (float): Fraction of the original rows to sample
          (e.g. 0.25 keeps roughly a quarter of the rows)
        - seed (int, optional): Seed for a dedicated random generator. When
          given, repeated calls on the same file return the same sample.
          When None (default), NumPy's global random state is used.

    Output:
        - df (pandas Dataframe): The sampled rows, re-indexed from 0

    """

    # Read the file once: the same frame supplies both the rows to sample
    # from and the total row count reported below.
    full_df = pd.read_csv(filepath, low_memory=False)

    # Draw one uniform number in [0, 1) per row; a row is kept when its draw
    # does not exceed pct_of_file.
    rng = np.random if seed is None else np.random.default_rng(seed)
    keep = rng.random(len(full_df)) <= pct_of_file

    df = full_df[keep].reset_index(drop=True)

    print(f"Sampling complete. Sampled {len(df)} rows from {len(full_df)} rows")

    return df

### 3. Generate Sample From the Dataframe

In [26]:
# Draw the rows to be labelled from the merged dataset, using the sampler's default fraction
sampled_df = sample_generator(filepath='../data/raw/merged.csv')

Sampling complete. Sampled 691 rows from 2701 rows


### 4. Save Sample Dataframe to CSV File
This CSV will be subsequently annotated

In [27]:
def save_file_with_name(name):
    """Write sampled_df to ../data/raw/<name>.csv, omitting the index column."""
    sampled_df.to_csv(f"../data/raw/{name}.csv", index=False)


# Full sample with every column
save_file_with_name('sample')

# Text-only copy of the sample, with the 'Text' column renamed to 'text'
(
    sampled_df[['Text']]
    .rename(columns={'Text': 'text'})
    .to_csv("../data/raw/sample_tweet_text.csv", index=False)
)