# Teacher Cleansing

In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
#Column names that every teacher batch file must contain, listed in the required order.
expected_columns = [
    'teacher_id',
    'alpha_index',
    'batch_id',
    'scrape_date',
    'scrape_status',
    'teacher_name',
    'location',
    'followers',
    'languages',
    'date_joined',
    'about',
    'image_url',
]

In [None]:
#Datatypes dictionary: maps each batch file column name to the dtype it should be read as.
#Every key must match a column name exactly, otherwise that column's dtype is never applied.
teacher_datatypes = {'teacher_id':'str',
                     'alpha_index':'str',
                     'batch_id':'int',
                     'scrape_date':'str', #will convert to datetime
                     'scrape_status':'str',
                     'teacher_name':'str',
                     'location':'str',
                     'followers':'str',
                     'languages':'str', #was misspelled 'langauges', which matches no column
                     'date_joined':'str',
                     'about':'str',
                     'image_url':'str'}

In [None]:
#Load every batch data file in the teacher_batch_files subdirectory into one dataframe.
batch_files_dir = '../data/teacher_batch_files'

#os.listdir returns names in arbitrary order; sorting makes the concatenated row order reproducible.
batch_files_list = sorted(os.listdir(batch_files_dir))

batch_df_list = []

for batch_file in batch_files_list:
    batch_path = os.path.join(batch_files_dir, batch_file)

    #Skip anything that is not a regular file (e.g. a subdirectory), which read_csv cannot open.
    if not os.path.isfile(batch_path):
        continue

    batch_df = pd.read_csv(batch_path, index_col = 0)

    #Verify the batch has the correct columns in the correct order.
    #List equality checks both the length and every position at once.
    batch_columns = list(batch_df.columns)
    correct_columns = batch_columns == expected_columns

    if correct_columns:
        batch_df_list.append(batch_df)
    else:
        #Report the offending file and leave it out of the combined dataframe.
        print('ERROR COLUMNS NOT AS EXPECTED',batch_file)

#Each batch keeps the index labels read from its own file (index_col = 0).
teachers_df = pd.concat(batch_df_list)

In [None]:
#Show the (rows, columns) size of the combined teachers dataframe.
teachers_df.shape

### Cleansing rows: Page not found
- Remove duplicates
- Remove page not found
- Investigate name not found

In [None]:
#Display the rows whose scrape_status is 'page not found'.
page_not_found_mask = teachers_df['scrape_status'] == 'page not found'
teachers_df[page_not_found_mask]

In [None]:
#Keep only the rows whose scrape_status is something other than 'page not found'.
page_was_found_mask = teachers_df['scrape_status'].ne('page not found')
teachers_df = teachers_df[page_was_found_mask]

### Cleansing rows: Name not found

In [None]:
#Display the rows whose scrape_status is 'name not found'.
#Rerunning the batches these rows came from may recover their info.
name_not_found_mask = teachers_df['scrape_status'] == 'name not found'
teachers_df[name_not_found_mask]

TODO: Rerun the batches that contain rows where the name was not found.

In [None]:
#Keep only the rows whose scrape_status is something other than 'name not found'.
name_was_found_mask = teachers_df['scrape_status'].ne('name not found')
teachers_df = teachers_df[name_was_found_mask]

### Cleansing rows: Duplicates

In [None]:
#Count the rows per teacher_id; any count above 1 is a teacher that appears more than once.
teachers_df.teacher_id.value_counts()

In [None]:
#Order the rows by teacher_id (ascending).
teachers_df = teachers_df.sort_values('teacher_id')

In [None]:
#Keep the first row seen for each teacher_id and discard any later repeats.
teachers_df = teachers_df.drop_duplicates(subset=['teacher_id'], keep='first')

### Null Date_Joined

In [None]:
#Display the rows that have no date_joined value.
date_joined_missing = teachers_df['date_joined'].isna()
teachers_df[date_joined_missing]

In [None]:
#Keep only the rows that have a date_joined value.
date_joined_present = teachers_df['date_joined'].notna()
teachers_df = teachers_df[date_joined_present]

### Cleansing columns

In [None]:
#Clean the location, followers, languages, date_joined and about columns.
#
#Each column is transformed as a whole instead of writing one cell at a time with
#teachers_df.loc[index, column]. The index labels come from the individual batch files, so
#they may repeat across batches; a label-based write would then overwrite every row sharing
#that label rather than just the row being processed.

def _text_or_unknown(value):
    """Return value unchanged when it is a string, otherwise 'Unknown' (e.g. for NaN)."""
    return value if isinstance(value, str) else 'Unknown'


def _parse_followers(raw_followers):
    """Convert a followers label such as '12 followers' or '1.2k followers' to an int.

    A value that is not a string (e.g. NaN for a missing label) counts as 0 followers.
    Raises ValueError when the label is blank or its count is not numeric.
    """
    if not isinstance(raw_followers, str):
        return 0

    #The count is the first whitespace-separated token; the word after it is ignored.
    tokens = raw_followers.split()
    if not tokens:
        raise ValueError('followers label is blank')
    count_text = tokens[0]

    if count_text.endswith('k'):
        #Round before converting: the float product can fall just below the intended
        #integer (1.005 * 1000 == 1004.9999999999999), and int() alone would truncate it.
        return int(round(float(count_text[:-1]) * 1000))
    return int(count_text)


def _first_of_month_text(date_joined):
    """Build 'Mon/1/YYYY' from a string whose last eight characters are 'Mon YYYY'."""
    date_joined_year = date_joined[-4:]
    date_joined_month = date_joined[-8:-5]
    return date_joined_month + '/1/' + date_joined_year


#Work on a copy so the assignments below do not write into a slice of an earlier dataframe.
teachers_df = teachers_df.copy()

#location, languages, about
for text_column in ['location', 'languages', 'about']:
    teachers_df[text_column] = teachers_df[text_column].map(_text_or_unknown)

#followers
teachers_df['followers'] = teachers_df['followers'].map(_parse_followers)

#date_joined -- Note that rows with null date_joined have already been dropped.
#Only the month and year are read, so every date is set to the first day of its month.
teachers_df['date_joined'] = pd.to_datetime(teachers_df['date_joined'].map(_first_of_month_text),
                                            format = '%b/%d/%Y')

In [None]:
# Convert both date columns to datetime and the followers column to integer.
teachers_df = teachers_df.assign(
    scrape_date=pd.to_datetime(teachers_df['scrape_date']),
    date_joined=pd.to_datetime(teachers_df['date_joined']),
).astype({'followers': 'int'})

### Save Final Dataframe

In [None]:
#Write the cleansed teachers dataframe to a CSV file in the data directory.
teachers_df.to_csv('../data/teachers_df.csv')

In [None]:
#Print each column's dtype and non-null count.
teachers_df.info()

### Languages

In [None]:
#Preview the first 20 values of the languages column.
teachers_df.languages.head(20)

In [None]:
#unemployment['Name'].str.split(',', expand = True)

#str('hello, this, is, my, deal').split(', ')
#str('hello this is my deal').split(', ')
#
#['hello, okay', 'meow'].split(', ')

In [None]:
#Create an empty dataframe that has only the teacher-language column headers.
teacher_language_columns = ['teacher_id', 'language_native', 'language_english']
teacher_languages_df = pd.DataFrame(columns=teacher_language_columns)

In [None]:
#Build a dataframe with one row per (teacher, language) pair from the languages column.

def _split_languages(languages_text):
    """Split a languages string such as 'A, B and C' into ['A', 'B', 'C'].

    Both ', ' and ' and ' act as separators wherever they appear, so strings with no
    ' and ' (e.g. 'A, B, C,') are split as well. Surrounding whitespace is removed and
    empty pieces (from a trailing or doubled comma) are dropped.
    """
    comma_separated = languages_text.replace(' and ', ', ')
    pieces = [piece.strip() for piece in comma_separated.split(',')]
    return [piece for piece in pieces if piece]


teacher_id_list = []
language_native_list = []
language_english_list = []

#Walk the two columns directly; append keeps the work linear in the number of rows.
for teacher_id, languages_text in zip(teachers_df['teacher_id'], teachers_df['languages']):
    for language in _split_languages(languages_text):
        teacher_id_list.append(teacher_id)
        language_native_list.append(language)
        #No translation is applied here, so the English name starts equal to the native name.
        language_english_list.append(language)

teacher_languages_dict = {'teacher_id':teacher_id_list,
                          'language_native':language_native_list,
                          'language_english':language_english_list}

teacher_languages_df = pd.DataFrame(teacher_languages_dict)

teacher_languages_df

In [None]:
#Count the non-null entries of every other column for each distinct language_native value.
teacher_languages_df.groupby(by=['language_native']).count()

In [None]:
#Display the teachers whose languages value is exactly this comma-terminated string.
trailing_comma_mask = teachers_df['languages'] == '普通話, 廣東話, English,'
teachers_df[trailing_comma_mask]

In [None]:
#Lookup table from a language name as written on the site to its English name.
languages_dict = {
    'English': 'English',
    'Unknown': 'Unknown',
    'Español': 'Spanish',
    'BR. Português': 'Portuguese (Brazil)',
    'Deutsch': 'German',
    'Русский': 'Russian',
    '日本語': 'Japanese',
    'Tiếng Việt': 'Vietnamese',
    'Polski': 'Polish',
    'Italiano': 'Italian',
    'اُردُو': 'Urdu',
    'हिंदी': 'Hindi',
    'ਗੁਰਮੁਖੀ': 'Punjabi',
}