In [38]:
import pandas as pd
import numpy as np

# Toy contact records (column name -> list of values) used to build a small demo frame.
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Smith"],
    email=[
        "CoreySchafer@gmail.com",
        "JaneDoe@hotmail.com",
        "JohnSmith@outlook.com",
    ],
)


# Notebook-wide pandas display settings, applied in one pass:
# show up to 100 columns/rows, never truncate cell text, and render
# floats with four decimal places.
for option_name, option_value in (
    ('display.max_columns', 100),
    ('display.max_rows', 100),
    ('display.max_colwidth', None),
    ('display.float_format', lambda value: '%.4f' % value),
):
    pd.set_option(option_name, option_value)


# Directory holding the survey CSV files (machine-specific absolute path).
DATA_DIR = '/Users/josephyu/Documents/GitHub/data'
survey_source = f'{DATA_DIR}/survey_results_public.csv'
schema_source = f'{DATA_DIR}/survey_results_schema.csv'

# Survey answers indexed by the 'Respondent' column; schema indexed by 'Column'.
df = pd.read_csv(survey_source, index_col='Respondent')
schema_df = pd.read_csv(schema_source, index_col='Column')

# Small hand-written frame built from the `people` dict.
pp_df = pd.DataFrame(people)

# Columns with perfect data quality (presumably no missing values — verify): ['Hobbyist', 'OpenSourcer', 'Region']

In [3]:
# Point `df` at the small people frame for the demos below. This is an alias,
# not a copy, and the survey frame loaded earlier is no longer reachable as `df`.
df = pp_df

In [4]:
# Trim leading/trailing whitespace from every cell. Column-wise `.str.strip()`
# replaces `DataFrame.applymap(str.strip)`: `applymap` is deprecated in
# pandas 2.1+, and `str.strip` raises TypeError on missing values (NaN),
# whereas the `.str` accessor leaves them as NaN.
df = df.apply(lambda column: column.str.strip())

In [5]:
# Full name: first and last name joined by a single space.
df['name'] = df['first'].str.cat(df['last'], sep=' ')

In [12]:
# Adding a row from a dict extends the frame along BOTH axes at once: keys that
# match existing columns fill them, and unknown keys become new columns, which
# suits schemaless (NoSQL-style) records.
# NOTE: make sure the keys do not introduce unnecessary (duplicate) columns,
# e.g. 'first' vs. 'first_name'.
#
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, so the
# dict is wrapped in a one-row frame and concatenated instead;
# ignore_index=True renumbers the resulting rows 0..n-1.
new_row = {
    # NOTE(review): 'name' does not match first/last below — confirm this is intended.
    'name': 'Adam Smith',
    'first': 'Joseph',
    'last': 'Yu',
    'email': 'JosephYu@gmail.com',
}
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [13]:
# Display the frame, including the row added in the previous cell.
df

Unnamed: 0,first,last,email,name
0,Corey,Schafer,CoreySchafer@gmail.com,Corey Schafer
1,Jane,Doe,JaneDoe@hotmail.com,Jane Doe
2,John,Smith,JohnSmith@outlook.com,John Smith
3,Joseph,Yu,JosephYu@gmail.com,Adam Smith


In [14]:
# `drop` removes entries by label: rows via index= (or axis='index'),
# columns via columns= (or axis='columns').
# It returns a new frame; `df` is not modified. Here the row labelled 2 is removed.
df.drop(labels=2, axis='index')

Unnamed: 0,first,last,email,name
0,Corey,Schafer,CoreySchafer@gmail.com,Corey Schafer
1,Jane,Doe,JaneDoe@hotmail.com,Jane Doe
3,Joseph,Yu,JosephYu@gmail.com,Adam Smith


In [33]:
# Boolean mask of rows whose email contains the literal text 'gmail.com'.
# regex=False: the pattern is plain text; as a regex, '.' would match any character.
# na=False: missing emails count as non-matches so the mask stays strictly boolean.
filt = df['email'].str.contains('gmail.com', regex=False, na=False)

# Drop the matching rows and the 'name'/'email' columns in a single call;
# the result is a new frame and `df` itself is unchanged.
df.drop(index=df[filt].index,
        columns=['name', 'email'])

Unnamed: 0,first,last
1,Jane,Doe
2,John,Smith


In [68]:
# `df` was rebound to the small people frame in the demo cells above, so on a
# top-to-bottom run it no longer holds the survey data and the label slice
# below would fail. Reload the survey so this cell works from a fresh kernel.
df = pd.read_csv(survey_source, index_col='Respondent')

# Share of missing values per column, highest first; the label slice keeps every
# column from the top of that ranking down to and including 'WebFrameWorkedWith'.
col_drop = df.isna().mean().sort_values(ascending=False)[:'WebFrameWorkedWith'].index

In [69]:
# The 100 least frequent countries, rarest first, cut off at (and including) 'Azerbaijan'.
country_counts = df['Country'].value_counts()
rarest_countries = country_counts.nsmallest(100)
tgt_country = rarest_countries.loc[:'Azerbaijan'].index.tolist()

# Index labels of the rows whose country is in that list.
row_filt = df['Country'].isin(tgt_country)
row_drop = df.loc[row_filt].index

In [70]:
# (rows, columns) of the frame before anything is dropped.
df.shape

(88883, 84)

In [74]:
# Remove the rows in `row_drop`, then the columns in `col_drop`, and report the
# shape of the result; `df` itself is left untouched.
df.drop(index=row_drop).drop(columns=col_drop).shape

(87745, 67)