In [2]:
import pandas as pd
import numpy as np
import re


In [3]:
# Read the survey export from disk once. `df_ori` keeps the data exactly as
# loaded; `df` is an independent copy that the following cells clean, so the
# two frames are guaranteed to start from identical contents.
df_ori = pd.read_csv("Survey concerning GPA.csv")
df = df_ori.copy()


In [4]:
# Give the columns more readable names.
# The assignment is positional: the list has to name every column of `df`,
# in order.

readable_columns = [
    'CGPA during in person studies',
    'CGPA during online studies',
    'in-person or an online environment',
    'GPA change',
]
df.columns = readable_columns

In [5]:
# Show a handful of rows, selected by position, as a sample of the raw answers.
preview_rows = [3, 7, 8, 14, 15, 24, 38]
df.iloc[preview_rows]


Unnamed: 0,CGPA during in person studies,CGPA during online studies,in-person or an online environment,GPA change
3,2,2+,Online,0.5
7,,,Online,0.3
8,4.15,3.75,In-person,Do the math lol
14,around 2.5,above 2.5,Online,0.3
15,3.23,2.9,In-person,0.33
24,297,267,In-person,037
38,2.88,not sure but it was 2.5+,In-person,increase of around 0.38 points from online to ...


In [6]:
# Despite clear instructions, some respondents entered unwanted symbols
# such as "+" and "-", and others entered non-numeric text.
# The following functions use regular expressions to find these
# characters and remove them.

def symbolStrip(s):
    """Return ``s`` with every "+" and "-" character removed."""
    plus_or_minus = re.compile(r"[\+\-]")
    return plus_or_minus.sub("", s)

def nanToZero(s):
    """Convert the missing-value marker "nan" to "0".

    Only a cell whose whole content is "nan" (optionally surrounded by
    whitespace) is replaced. Text that merely contains the letters "nan"
    inside a longer answer (e.g. "finance") is returned unchanged, so no
    stray "0" digit is injected into it.

    Parameters:
        s: the cell content as a string.

    Returns:
        "0" when ``s`` is the missing-value marker, otherwise ``s`` itself.
    """
    if re.fullmatch(r"\s*nan\s*", s):
        return "0"
    return s

def nonNumericStrip(s):
    """Return ``s`` with ASCII letters and whitespace removed.

    Digits, punctuation and any other symbols are left in place.
    """
    letters_and_whitespace = re.compile(r"[a-z\sA-Z]")
    return letters_and_whitespace.sub("", s)

def commaToPeriod(s):
    """Return ``s`` with every comma replaced by a period (decimal point)."""
    comma = re.compile(r"[\,]")
    return comma.sub(".", s)

In [7]:
# For ease of cleaning, cast every column of the dataframe to str.

df = df.astype(str)

# Run the same four cleaning functions, in the same order, over each column
# that requires cleaning.

columns_to_clean = ['CGPA during in person studies',
                    'CGPA during online studies',
                    'GPA change']
cleaning_steps = (symbolStrip, nanToZero, nonNumericStrip, commaToPeriod)

for column_name in columns_to_clean:
    cleaned = df[column_name]
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)
    df[column_name] = cleaned

In [8]:
# Converting the cleaned CGPA columns back to a numerical datatype.
# pd.to_numeric(errors="coerce") turns any text that still cannot be parsed
# (for example a cell left empty once the letters were stripped) into NaN
# instead of raising ValueError and aborting the notebook.

MAX_VALID_GPA = 4.5  # entries above this value are treated as invalid

for cgpa_column in ("CGPA during in person studies",
                    "CGPA during online studies"):
    df[cgpa_column] = (pd.to_numeric(df[cgpa_column], errors="coerce")
                         .astype(float))

# Filtering out invalid data: the entire row is deleted when either CGPA
# is larger than MAX_VALID_GPA. NaN compares False, so rows whose CGPA
# could not be parsed are kept (with NaN) rather than dropped here.

df = df.drop(df[(df['CGPA during in person studies'] > MAX_VALID_GPA)].index)
df = df.drop(df[(df['CGPA during online studies'] > MAX_VALID_GPA)].index)
        



In [9]:
# The survey allowed participants to enter only the difference of their GPA
# if they were not comfortable with sharing the GPA itself.

# Collect the positions of the rows in which both CGPA columns are 0 and copy
# those rows into a new dataframe holding only the entries without a CGPA.

data1 = list(df['CGPA during in person studies'])
data2 = list(df['CGPA during online studies'])

ls = [position
      for position, (in_person, online) in enumerate(zip(data1, data2))
      if in_person == 0 and online == 0]

anonEntryDf = df.iloc[ls].copy()
anonEntryDf

Unnamed: 0,CGPA during in person studies,CGPA during online studies,in-person or an online environment,GPA change
7,0.0,0.0,Online,0.3
30,0.0,0.0,In-person,0.29
31,0.0,0.0,In-person,0.22
43,0.0,0.0,In-person,0.28
47,0.0,0.0,In-person,0.3


In [10]:
# Converting GPA change to float.

anonEntryDf["GPA change"] = anonEntryDf["GPA change"].astype(float)

# One sign per anonymous entry: -1 when the environment column reads
# "Online", +1 for any other value. Multiplying the reported change by this
# sign makes the difference negative for "Online" rows and positive otherwise.
# NOTE(review): this presumes the environment column records where the
# participant did better - confirm against the survey question.

ls_ = [-1 if environment == "Online" else 1
       for environment in list(anonEntryDf['in-person or an online environment'])]

reported_change = np.array(anonEntryDf["GPA change"])
anonEntryDf["GPA difference (in-person - online)"] = reported_change * np.array(ls_)

In [11]:
# Display only the signed difference column of the anonymous entries.
anonEntryDf.loc[:, ['GPA difference (in-person - online)']]

Unnamed: 0,GPA difference (in-person - online)
7,-0.3
30,0.29
31,0.22
43,0.28
47,0.3


In [12]:
# Finding the GPA difference by subtracting the online CGPA from the
# in-person CGPA.

in_person_cgpa = df["CGPA during in person studies"]
online_cgpa = df["CGPA during online studies"]

# A CGPA of 0 is the placeholder written for a missing answer (nanToZero).
# When exactly one of the two CGPAs is missing, the subtraction would yield
# the other CGPA itself (e.g. 3.2 - 0) rather than a real difference, so
# those rows get NaN. Rows where both are 0 keep their 0 difference.
# NOTE(review): assumes no participant genuinely has a CGPA of 0 - confirm.
one_cgpa_missing = (in_person_cgpa == 0) ^ (online_cgpa == 0)

# Rounding removes binary floating-point noise (3.3 - 3.0 evaluates to
# 0.2999999999999998), which would otherwise fall just below a threshold
# such as 0.30 in later comparisons.
df["GPA difference (in-person - online)"] = (
    (in_person_cgpa - online_cgpa).round(6).mask(one_cgpa_missing))

In [13]:
# Merging our dataframe with our anonymous-entries dataframe: the rows of df
# at the positions stored in `ls` are overwritten with anonEntryDf.
# NOTE(review): the assignment is positional, so it relies on `ls` still
# matching df's row order and on both frames having the same column order.

df.iloc[ls] = anonEntryDf

In [16]:
# Collect the GPA differences that reach each cutoff and print how many
# entries each collection holds.
col_data = list(df['GPA difference (in-person - online)'])

data_30 = [difference for difference in col_data if difference >= 0.30]
data_20 = [difference for difference in col_data if difference >= 0.20]
# NOTE(review): the name says 15 but the cutoff used is 0.10 - confirm which
# one is intended.
data_15 = [difference for difference in col_data if difference >= 0.10]

print("number of entries")

for bucket in (data_30, data_20, data_15):
    print(len(bucket))


        

number of entries
14
22
29


In [21]:
# Save the cleaned dataframe to disk. With pandas' default settings the row
# index is written as the first column of the CSV.
df.to_csv('cleanData.csv')