In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Folder containing the input CSV; the trailing '/' lets file names be appended directly.
file_path = 'data/UFRGS/'
csv_path = file_path + 'data.csv'

# Read the full table and preview its first rows.
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,gender,physics,biology,history,SecondLanguage,geography,literature,PortugueseEssay,math,chemistry,GPA
0,0,622.6,491.56,439.93,707.64,663.65,557.09,711.37,731.31,509.8,1.33333
1,1,538.0,490.58,406.59,529.05,532.28,447.23,527.58,379.14,488.64,2.98333
2,1,455.18,440.0,570.86,417.54,453.53,425.87,475.63,476.11,407.15,1.97333
3,0,756.91,679.62,531.28,583.63,534.42,521.4,592.41,783.76,588.26,2.53333
4,1,584.54,649.84,637.43,609.06,670.46,515.38,572.52,581.25,529.04,1.58667


In [16]:
# Tally the rows per gender code to see how the two groups compare in size.
df.loc[:, 'gender'].value_counts()

1    22335
0    20968
Name: gender, dtype: int64

In [17]:
# Cutoff for the binary label: GPA at or above this value becomes 1, anything below becomes 0.
gpa_threshold = 3.3

# Vectorized comparison instead of a per-row Python lambda. Missing GPA values
# compare as False and therefore map to 0, exactly as int(x >= threshold) did.
df['GPA_bi'] = (df['GPA'] >= gpa_threshold).astype('int64')

# Row count for every (gender, GPA_bi) combination; `math` is only the column being counted.
print(
    df[['GPA_bi', 'gender', 'math']]
    .groupby(by=['gender', 'GPA_bi'])
    .count()
)

                math
gender GPA_bi       
0      0       12607
       1        8361
1      0       16914
       1        5421


In [18]:
# (rows, columns) of the working frame.
df.shape

(43303, 12)

## Downsample population (gender == 0 group)

In [19]:
# Probability with which each gender == 0 row is retained in the sampling step.
sample_ratio = 0.3

# gender == 0 rows are the ones that get sampled; .copy() detaches them from df.
mi_df = df.query('gender==0').copy()
# gender == 1 rows are carried over in full.
ma_df = df.query('gender==1')


In [20]:
# Reseed NumPy's global RNG so the same rows are selected on every run.
np.random.seed(1)
# Despite the name, this is a boolean mask: one uniform draw per mi_df row,
# True (keep) when the draw falls below sample_ratio.
sampled_indices = np.random.rand(mi_df.shape[0]) < sample_ratio

# Keep only the rows whose mask entry is True and report how many survived.
sampled_mi = mi_df[sampled_indices]
len(sampled_mi)

6296

In [21]:
# Stack the untouched gender == 1 rows on top of the sampled gender == 0 rows.
# reset_index() renumbers the rows 0..n-1 and keeps the old labels in an
# 'index' column.
sample_df = pd.concat([ma_df, sampled_mi]).reset_index()
len(sample_df)

28631

In [22]:
# Rows per gender code after downsampling.
sample_df.loc[:, 'gender'].value_counts()

1    22335
0     6296
Name: gender, dtype: int64

In [23]:
# Row count for every (gender, GPA_bi) combination in the downsampled frame;
# `math` is only the column being counted.
print(
    sample_df[['GPA_bi', 'gender', 'math']]
    .groupby(by=['gender', 'GPA_bi'])
    .count()
)

                math
gender GPA_bi       
0      0        3782
       1        2514
1      0       16914
       1        5421


In [24]:
# Discard the 'index' column left behind by reset_index() together with the
# continuous GPA, then let the binary GPA_bi label take over the 'GPA' name.
# Not re-runnable: a second execution fails because 'index' is already gone.
sample_df = (
    sample_df
    .drop(columns=['index', 'GPA'])
    .rename(columns={'GPA_bi': 'GPA'})
)
sample_df

Unnamed: 0,gender,physics,biology,history,SecondLanguage,geography,literature,PortugueseEssay,math,chemistry,GPA
0,1,538.00,490.58,406.59,529.05,532.28,447.23,527.58,379.14,488.64,0
1,1,455.18,440.00,570.86,417.54,453.53,425.87,475.63,476.11,407.15,0
2,1,584.54,649.84,637.43,609.06,670.46,515.38,572.52,581.25,529.04,0
3,1,325.99,466.74,597.06,554.43,535.77,717.03,477.60,503.82,422.92,0
4,1,527.65,559.99,758.37,669.71,645.62,648.67,539.23,470.78,486.13,0
...,...,...,...,...,...,...,...,...,...,...,...
28626,0,482.47,534.23,600.00,499.43,487.11,622.29,595.25,405.98,507.95,0
28627,0,565.25,610.70,683.67,657.02,582.79,673.57,592.41,495.40,566.54,0
28628,0,646.34,682.52,598.85,645.05,582.52,509.21,550.95,491.42,604.94,0
28629,0,586.54,541.77,574.82,657.02,534.42,546.76,574.01,416.76,371.09,0


In [25]:
# Save the population-downsampled frame under its own file name (index=False
# leaves the row index out of the CSV). The previous target,
# 'data/UFRGS/data_samples.csv', is also written by the positive-label section
# further down, so this result was overwritten; the reload cell that prints
# these counts reads 'data/UFRGS/data_samples_g0.csv'.
sample_df.to_csv('data/UFRGS/data_samples_g0.csv', index=False)

In [3]:
# Reload the saved sample and count the rows in every (gender, GPA) cell.
# NOTE(review): the counts printed below (12607 / 1262 for gender 0) match the
# positive-label downsample, not a population downsample as the `down_popu_df`
# name seems to suggest — confirm which file this variable should read.
down_popu_df = pd.read_csv('data/UFRGS/data_samples.csv')
print(
    down_popu_df[['GPA', 'gender', 'math']]
    .groupby(by=['gender', 'GPA'])
    .count()
)

             math
gender GPA       
0      0    12607
       1     1262
1      0    16914
       1     5421


In [5]:
# Reload the second saved sample and count the rows in every (gender, GPA) cell.
# NOTE(review): the counts printed below (3782 / 2514 for gender 0) match the
# population downsample, not a positive-label downsample as the `down_pos_df`
# name seems to suggest — confirm which file this variable should read.
down_pos_df = pd.read_csv('data/UFRGS/data_samples_g0.csv')
print(
    down_pos_df[['GPA', 'gender', 'math']]
    .groupby(by=['gender', 'GPA'])
    .count()
)

             math
gender GPA       
0      0     3782
       1     2514
1      0    16914
       1     5421


## Downsample positive label (gender == 0, GPA_bi == 1 rows)

In [23]:
# Only the gender == 0 rows with a positive label are candidates for sampling;
# .copy() detaches them from df.
mi_pos = df.query('gender==0 and GPA_bi==1').copy()
# Everything else is carried over in full: gender == 0 negatives ...
mi_neg_df = df.query('gender==0 and GPA_bi==0')
# ... and all gender == 1 rows.
ma_df = df.query('gender==1')

# Portion of the data that stays untouched by the sampling step.
orig_df = pd.concat([ma_df, mi_neg_df])
orig_df.shape

(34942, 12)

In [30]:
# Keep-probability for each mi_pos row, chosen so that the expected number of
# retained positives equals 0.1 * (number of mi_neg_df rows).
# NOTE: this rebinds the `sample_ratio` used by the population-downsampling cells.
sample_ratio = 0.1 * len(mi_neg_df) / len(mi_pos)
sample_ratio

0.1507833991149384

In [31]:
# Reseed NumPy's global RNG so the same rows are selected on every run.
np.random.seed(1)
# Despite the name, this is a boolean mask: one uniform draw per mi_pos row,
# True (keep) when the draw falls below sample_ratio.
sampled_indices = np.random.rand(mi_pos.shape[0]) < sample_ratio

# Keep only the rows whose mask entry is True and report how many survived.
sampled_pos = mi_pos[sampled_indices]
len(sampled_pos)

1262

In [33]:
# Display the retained gender == 0, GPA_bi == 1 rows (they keep their original df index labels).
sampled_pos

Unnamed: 0,gender,physics,biology,history,SecondLanguage,geography,literature,PortugueseEssay,math,chemistry,GPA,GPA_bi
19,0,714.51,760.38,660.90,521.62,725.30,687.42,570.19,719.01,753.44,3.77667,1
33,0,541.21,594.57,567.04,521.62,423.71,493.88,580.82,581.25,484.16,3.30000,1
35,0,558.19,697.72,469.68,625.22,672.07,692.05,589.81,567.34,611.12,3.41333,1
84,0,505.06,482.71,468.02,456.85,460.69,648.67,461.30,535.57,529.77,3.73000,1
102,0,525.97,662.49,537.93,722.57,513.00,549.81,754.43,534.14,583.29,3.46333,1
...,...,...,...,...,...,...,...,...,...,...,...,...
43158,0,664.07,655.35,563.73,554.58,481.27,664.57,579.14,661.89,703.59,3.40000,1
43228,0,628.92,541.03,496.09,602.75,556.57,551.71,569.65,643.31,655.45,3.79667,1
43230,0,749.03,789.19,723.29,650.16,712.93,858.78,640.17,810.15,780.28,3.71000,1
43251,0,522.65,518.79,553.05,583.63,582.79,673.57,514.72,390.54,436.24,3.30000,1


In [47]:
# Append the sampled positives to the untouched portion of the data.
# reset_index() renumbers the rows 0..n-1 and keeps the old labels in an
# 'index' column.
sample_df = pd.concat([orig_df, sampled_pos]).reset_index()
len(sample_df)

36204

In [48]:
# Row count for every (gender, GPA_bi) combination after the positives were
# thinned out; `math` is only the column being counted.
print(
    sample_df[['GPA_bi', 'gender', 'math']]
    .groupby(by=['gender', 'GPA_bi'])
    .count()
)

                math
gender GPA_bi       
0      0       12607
       1        1262
1      0       16914
       1        5421


In [49]:
# Discard the 'index' column left behind by reset_index() together with the
# continuous GPA, then let the binary GPA_bi label take over the 'GPA' name.
# Not re-runnable: a second execution fails because 'index' is already gone.
sample_df = (
    sample_df
    .drop(columns=['index', 'GPA'])
    .rename(columns={'GPA_bi': 'GPA'})
)
sample_df

Unnamed: 0,gender,physics,biology,history,SecondLanguage,geography,literature,PortugueseEssay,math,chemistry,GPA
0,1,538.00,490.58,406.59,529.05,532.28,447.23,527.58,379.14,488.64,0
1,1,455.18,440.00,570.86,417.54,453.53,425.87,475.63,476.11,407.15,0
2,1,584.54,649.84,637.43,609.06,670.46,515.38,572.52,581.25,529.04,0
3,1,325.99,466.74,597.06,554.43,535.77,717.03,477.60,503.82,422.92,0
4,1,527.65,559.99,758.37,669.71,645.62,648.67,539.23,470.78,486.13,0
...,...,...,...,...,...,...,...,...,...,...,...
36199,0,664.07,655.35,563.73,554.58,481.27,664.57,579.14,661.89,703.59,1
36200,0,628.92,541.03,496.09,602.75,556.57,551.71,569.65,643.31,655.45,1
36201,0,749.03,789.19,723.29,650.16,712.93,858.78,640.17,810.15,780.28,1
36202,0,522.65,518.79,553.05,583.63,582.79,673.57,514.72,390.54,436.24,1


In [50]:
# Persist the positive-label-downsampled frame; index=False leaves the row index out of the CSV.
sample_df.to_csv('data/UFRGS/data_samples.csv', index=False)