# MoMA's collection data cleaning

## Problem solving
I'm working for MoMA, and they'd like to know which departments they need to enrich, based on the current collection. 

**What are the three least valuable classifications?**

In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
# Load the collection CSV and preview its first rows.
df = pd.read_csv('data/museum_modern_art.csv', sep=',')
df.head()

In [None]:
# Show the dtype pandas assigned to each column of the raw frame.
df.dtypes

## Renaming columns

In [None]:
# Rename the 'Unnamed: 0' column to 'Id'; the result is a new frame, df1.
column_renames = {'Unnamed: 0': 'Id'}
df1 = df.rename(columns=column_renames)

In [None]:
# List the column labels after renaming.
df1.columns

## Drop empty and useless columns

In [None]:
# Percentage of missing values per column, restricted to columns that
# have at least one missing value and rounded to two decimals.
null_col = df1.isna().sum()
row_count = df1.shape[0]
has_missing = null_col > 0
null_col_percent = round(null_col[has_missing] / row_count * 100, 2)
null_col_percent

In [None]:
# Drop every column in which more than 50% of the values are missing.
mostly_missing = null_col_percent > 50
drop_cols = null_col_percent[mostly_missing].index
df2 = df1.drop(columns=drop_cols)

In [None]:
# (rows, columns) remaining after dropping the mostly-missing columns.
df2.shape

## Remove Duplicates

In [None]:
# Work on a copy and count the fully duplicated rows.
df3 = df2.copy()
df3.duplicated().sum()

In [None]:
# Remove duplicated rows and print the shape before and after.
print(df3.shape)
df4 = df3.drop_duplicates()
print(df4.shape)

## Remove parentheses from text in relevant columns

In [None]:
# Columns from which the parentheses are stripped in the next cell.
parenthesis_col = [
    'ArtistBio',
    'Nationality',
    'BeginDate',
    'EndDate',
    'Gender',
    'Date',
]
parenthesis_col

# Keep df4 untouched; the stripping is applied to this copy.
df5 = df4.copy()

In [None]:
# Strip literal '(' and ')' from every column listed in parenthesis_col.
# regex=False makes the match literal on every pandas version: the former
# pattern '\(' only worked while str.replace defaulted to regex matching
# and silently removed nothing once the default became literal matching.
for col in parenthesis_col:
    df5[col] = (
        df5[col]
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )

df5.head()

## Clean Date values

In [None]:
# Copy the frame and report the distinct Date values before any cleaning.
df6 = df5.copy()
print(df6['Date'].unique())
start_values = df6['Date'].nunique()
print("total unique values in date: ", start_values)

In [None]:
# Cast every Date value to str so the regex-based cleaning can run on it.
df6['Date'] = df6['Date'].astype(str)

In [None]:
def test(date):
    """Return 0 when *date* is exactly a ``str`` and 1 otherwise."""
    return 0 if type(date) == str else 1

# Check the str conversion: tally how many Date values are str (0)
# versus anything else (1).
not_str_flags = df6.Date.apply(test)
count_type = not_str_flags.value_counts()
count_type

In [None]:
# Show the rows whose Date text contains 'March 30'.
has_march_30 = df6.Date.str.contains('March 30')
df6[has_march_30]

In [None]:
def clean_date(date):
    """Reduce a free-text date to a single year string where possible.

    The rules are tried in order and the first match wins:

    1. Four digits at the very end of the text -> those four digits.
    2. Four digits at the very start of the text -> those four digits.
    3. Four consecutive digits anywhere -> the first such run.
    4. Three digits followed by ``?`` (e.g. ``'196?'``) -> every ``?`` is
       replaced by ``0`` and the first four-digit run is returned.
    5. Only letters, spaces, ``,``, ``?`` and ``.`` -> ``np.nan``.
    6. Contains ``'century'`` -> the first number in the text followed by
       ``'00'`` (``'8th-9th century C.E.'`` -> ``'800'``,
       ``'19th century'`` -> ``'1900'``); ``np.nan`` if there is no number.
    7. Anything else is returned unchanged so it can be fixed by hand.

    Parameters
    ----------
    date : str
        Raw date text. ``None`` and NaN are accepted and yield ``np.nan``;
        any other non-string value is converted with ``str`` first.

    Returns
    -------
    str or float
        The extracted year as a string, the unchanged text, or ``np.nan``.
    """
    if not isinstance(date, str):
        # None and NaN (the only value unequal to itself) carry no date.
        if date is None or date != date:
            return np.nan
        date = str(date)

    trailing_year = re.search(r'[0-9]{4}$', date)
    if trailing_year:
        return trailing_year.group()

    leading_year = re.search(r'^[0-9]{4}', date)
    if leading_year:
        return leading_year.group()

    any_year = re.search(r'[0-9]{4}', date)
    if any_year:
        return any_year.group()

    if re.search(r'[0-9]{3}\?', date):
        # '196?' -> '1960': the unknown last digit is approximated by 0.
        return re.search(r'[0-9]{4}', date.replace('?', '0')).group()

    if re.search(r'^[a-zA-Z \,\?\.]+$', date):
        return np.nan

    if 'century' in date:
        # Use the whole leading number, not just the first character, so
        # two-digit centuries and prefixed text ('late 19th century') work.
        century_number = re.search(r'[0-9]+', date)
        if century_number is None:
            return np.nan
        return century_number.group() + '00'

    return date
    
    
# Try clean_date on a single century-style value and print the result.
date = '8th-9th century C.E.'
new_date = clean_date(date)
print(new_date)

In [None]:
# Run clean_date over every Date value of a fresh copy and compare the
# number of distinct values before and after.
df8 = df6.copy()
before_clean2 = df8['Date'].nunique()
print("total unique values in date before clean 2: ", before_clean2)

df8['Date'] = df8['Date'].apply(clean_date)

cleaned_dates = df8['Date']
print(cleaned_dates.value_counts())
print(cleaned_dates.unique())
clean2_values = cleaned_dates.nunique()
print("total unique values in date after clean 2: ", clean2_values)

In [None]:
## I won't use it
# Alternative clean-up ("clean 1"): strips a hand-picked list of substrings
# from df6.Date with chained str.replace calls, then reports the distinct
# values that remain. It modifies df6 only; df8 is created from df6 in the
# cell above, so df8 and the frames derived from it are not affected when
# the cells run in order.
# NOTE(review): the replacements are order-dependent. Patterns such as
# ' publihed' and 'newpaperSeptember' are spelled without 's' because the
# earlier replace('s','') has already removed that letter.
# NOTE(review): whether '.' and 'c.' are matched literally or as regular
# expressions depends on the installed pandas version's default for the
# `regex` argument of str.replace - confirm before reusing this cell.
df6.loc[:,'Date']=df6.loc[:,'Date'].str.replace("'",'').str.replace('.','').str.replace('early','').str.replace('s','').str.replace('c.','').str.replace('After','').str.replace('or before','').str.replace(' publihed','').str.replace('printed ','').str.replace('newpaperSeptember','').str.replace('exeted','').str.replace('Before','').str.replace(' ','')
print(df6.Date.unique())
clean1_values=df6.Date.nunique()
print("total unique values in date after clean 1: ", clean1_values)


In [None]:
## Manually cleaning inconsistent data
# The George Platt Lynes row whose Date is 'November 10' gets the year '1937'.
df9 = df8.copy()
is_lynes_november_10 = (df9.Date == 'November 10') & (df9.Artist == 'George Platt Lynes')
cel = df9[is_lynes_november_10]
df9.loc[cel.index, 'Date'] = '1937'

In [None]:
## Manually cleaning inconsistent data
# Rows whose Date is exactly 'newspaper published March 30' get a missing Date.
cel2 = df9[df9.Date == 'newspaper published March 30']
df9.loc[cel2.index, 'Date'] = np.nan
# Display the rows for the artist 'Jan Knap'.
df9.loc[df9.Artist == 'Jan Knap']

In [None]:
# Report the distinct Date values left after the manual fixes.
print(df9['Date'].unique())
clean3_values = df9['Date'].nunique()
print("total unique values in date after clean 3: ", clean3_values)

In [None]:
# convert_dtypes() returns a new DataFrame instead of modifying the frame
# in place, so its result has to be assigned; otherwise df10 keeps the
# original dtypes and the listing below shows no change.
df10 = df9.convert_dtypes()
df10.dtypes

In [None]:
# Column-wise maximum of df10.
df10.max()