# MoMA collection data cleaning

## Problem solving
I'm working for MoMA, and they'd like to know which departments they need to enrich, based on the current collection. 

**What are the three least valuable classifications?**

In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
# Load the raw collection export into a DataFrame and preview its first rows.
csv_path = 'data/museum_modern_art.csv'
df = pd.read_csv(csv_path, sep=',')
df.head()

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Show the (rows, columns) size of the raw frame.
df.shape

## Renaming columns

In [None]:
# Rename the 'Unnamed: 0' column to 'Id'; the result is a new frame, `df` is not modified.
column_renames = {'Unnamed: 0': 'Id'}
df1 = df.rename(columns=column_renames)

In [None]:
# List the column names after the rename.
df1.columns

## Drop empty and useless columns

In [None]:
# Percentage of missing values per column (two decimals), restricted to the
# columns that have at least one missing value.
null_col = df1.isna().sum()
null_col_percent = (null_col[null_col > 0] / len(df1) * 100).round(2)
null_col_percent

In [None]:
# Drop every column in which more than 50% of the values are missing.
mostly_missing = null_col_percent > 50
drop_cols = null_col_percent[mostly_missing].index
df2 = df1.drop(columns=drop_cols)

In [None]:
# Show the (rows, columns) size once the sparse columns are dropped.
df2.shape

## Remove Duplicates

In [None]:
# Work on a copy and count the rows that are exact duplicates of an earlier row.
df3 = df2.copy()
df3.duplicated().sum()

In [None]:
# Remove fully duplicated rows, printing the frame shape before and after.
print(df3.shape)
df4 = df3.drop_duplicates()
print(df4.shape)

## Remove parentheses from text in relevant columns

In [None]:
# Columns from which the parentheses will be stripped.
parenthesis_col = [
    'ArtistBio',
    'Nationality',
    'BeginDate',
    'EndDate',
    'Gender',
    'Date',
]
parenthesis_col

# Separate working copy, so df4 keeps the original text.
df5 = df4.copy()

In [None]:
# Strip the literal "(" and ")" characters from every listed column.
# regex=False makes the match literal on every pandas version: the previous
# '\(' / '\)' patterns only matched while Series.str.replace defaulted to regex
# mode, and were also invalid escape sequences in a non-raw string.
for col in parenthesis_col:
    df5[col] = (
        df5[col]
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
    )

df5.head()

In [None]:
# Count fully duplicated rows in df5.
df5.iloc[:,:].duplicated().sum()

## Clean Date values

In [None]:
# Snapshot the distinct Date values (and how many there are) before any cleaning.
df6 = df5.copy()
print(df6['Date'].unique())
start_values = df6['Date'].nunique()
print("total unique values in date: ", start_values)

In [None]:
# Cast every Date value to str.
df6['Date'] = df6['Date'].astype(str)

In [None]:
def test(date):
    """Return 1 when *date* is not exactly of type ``str``, otherwise 0."""
    return 0 if type(date) is str else 1


# Check if convert is working: tally the 0 (str) / 1 (not str) flags over Date.
count_type = df6['Date'].apply(test).value_counts()
count_type

In [None]:
def clean_date(date):
    """Reduce a free-text date to a single year string.

    The rules are tried in order and the first match wins:

    1. the text ends with four digits       -> those four digits
    2. the text starts with four digits     -> those four digits
    3. four consecutive digits anywhere     -> the first such run
    4. three digits followed by ``?``       -> every ``?`` becomes ``0`` and
       the first four-digit run is returned (``'193?'`` -> ``'1930'``)
    5. only letters, spaces and ``, ? .``   -> ``np.nan`` (no usable year)
    6. the text mentions ``century``        -> first number + ``'00'``
       (``'8th-9th century C.E.'`` -> ``'800'``, ``'12th century'`` -> ``'1200'``)
    7. anything else                        -> returned unchanged

    Args:
        date: Date text. Must be a ``str``; the ``re`` calls raise ``TypeError``
            otherwise.

    Returns:
        The extracted year as a string, ``np.nan`` when no year can be derived,
        or *date* itself when no rule applies.
    """
    if re.search(r'[0-9]{4}$', date):
        return date[-4:]
    if re.search(r'^[0-9]{4}', date):
        return date[:4]

    four_digits = re.search(r'[0-9]{4}', date)
    if four_digits:
        return four_digits.group()

    if re.search(r'[0-9]{3}\?', date):
        # An uncertain last digit ("193?") is approximated by the start of the decade.
        return re.search(r'[0-9]{4}', date.replace('?', '0')).group()

    if re.search(r'^[a-zA-Z ,?.]+$', date):
        return np.nan

    if re.search(r'century', date):
        # Use the whole leading number, not just the first character, so that
        # two-digit centuries ("12th century") are not truncated to "100".
        century = re.search(r'[0-9]+', date)
        return century.group() + '00' if century else np.nan

    return date
    
    
# Try clean_date on a century-style value and print what it returns.
date = '8th-9th century C.E.'
new_date = clean_date(date)
print(new_date)

In [None]:
# Apply clean_date to a fresh copy and compare the number of distinct Date
# values before and after.
df8 = df6.copy()
before_clean2 = df8['Date'].nunique()
print("total unique values in date before clean 2: ", before_clean2)

df8['Date'] = df8['Date'].apply(clean_date)

print(df8['Date'].value_counts())
print(df8['Date'].unique())
clean2_values = df8['Date'].nunique()
print("total unique values in date after clean 2: ", clean2_values)

In [None]:
## I won't use it - 1st way used to clean date values
# Overwrites df6.Date with the result of a chain of substring removals, then
# prints the distinct values and their count.
# The removals run left to right, so later patterns see the output of earlier
# ones (e.g. ' publihed' is what is left once every 's' has been removed).
# NOTE(review): whether patterns such as '.' and 'c.' are treated as regular
# expressions or literals depends on the str.replace default of the installed
# pandas version - confirm before reusing this cell.
df6.loc[:,'Date']=df6.loc[:,'Date'].str.replace("'",'').str.replace('.','').str.replace('early','').str.replace('s','').str.replace('c.','').str.replace('After','').str.replace('or before','').str.replace(' publihed','').str.replace('printed ','').str.replace('newpaperSeptember','').str.replace('exeted','').str.replace('Before','').str.replace(' ','')
print(df6.Date.unique())
clean1_values=df6.Date.nunique()
print("total unique values in date after clean 1: ", clean1_values)


In [None]:
## Manual fix: rows by 'George Platt Lynes' whose Date is 'November 10' get the year '1937'.
df9 = df8.copy()
lynes_november = (df9['Date'] == 'November 10') & (df9['Artist'] == 'George Platt Lynes')
cel = df9[lynes_november]
df9.loc[cel.index, 'Date'] = '1937'

In [None]:
## Manual fix: the Date 'newspaper published March 30' carries no year, so set it to NaN.
cel2 = df9[df9['Date'] == 'newspaper published March 30']
df9.loc[cel2.index, 'Date'] = np.nan
# Display the rows whose Artist is 'Jan Knap'.
df9[df9['Artist'] == 'Jan Knap']

In [None]:
# Distinct Date values (and their count) after the manual fixes.
print(df9['Date'].unique())
clean3_values = df9['Date'].nunique()
print("total unique values in date after clean 3: ", clean3_values)

## Guess Missing Date Values

In [None]:
# Number of rows whose Date is missing.
df9.Date.isna().sum()

In [None]:
# Drop the rows where both the artist id (ConstituentID) and the Date are missing.
no_artist_no_date = df9['ConstituentID'].isna() & df9['Date'].isna()
drop_row = df9[no_artist_no_date].index
df9_bis = df9.drop(index=drop_row)

In [None]:
# Count the dates that are still missing (computed, but not the last expression of the cell).
df9_bis.Date.isna().sum()
# Show the ConstituentID / Date pair of every remaining row.
df9_bis[['ConstituentID','Date']]

In [None]:
# Find the artists (ConstituentID) for whom every row has a missing Date.
# Number of rows per artist.
total_per_artist = df9_bis.ConstituentID.value_counts()
total_per_artist

# Number of rows with a missing Date per artist.
number_nan_per_artist = df9_bis[df9_bis.Date.isna()].ConstituentID.value_counts()

# An artist has "only NaN dates" when both counts are equal.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
list_artist_nan_date = [
    artist
    for artist, number_of_nan in number_nan_per_artist.items()
    if total_per_artist.loc[artist] == number_of_nan
]

len(list_artist_nan_date)

In [None]:
# Keep only the rows whose artist is NOT in the list of artists with only NaN dates.
# `~` is the boolean-mask negation operator; unary `-` on a boolean mask is not
# the idiom and is rejected by NumPy for plain boolean arrays.
df9_bis = df9_bis[~df9_bis.ConstituentID.isin(list_artist_nan_date)]

In [None]:
# Working copy without the rows whose Date is missing; it is used to compute
# the mean date of every artist.
df10 = df9_bis.copy()

null_date = df10[df10['Date'].isna()].index
df10 = df10.drop(index=null_date)

In [None]:
# Dates must be integers before a mean can be computed.
df10['Date'] = df10['Date'].astype(int)

In [None]:
# Mean Date per artist (ConstituentID), rounded and stored as int.
mean_date = df10.groupby('ConstituentID')['Date'].mean().round().astype(int)
# Spot-check the value for ConstituentID '27'. `.iloc[0]` takes the first match
# by position explicitly; a bare `[0]` on a label-indexed Series relies on a
# deprecated positional fallback.
mean_date[mean_date.index == '27'].iloc[0]
mean_date

In [None]:
# Look up the mean date stored under the label '9971'.
mean_date.loc['9971']

In [None]:
# Function to return the mean of the constituentID
def getmean(x):
    """Return the entry of ``mean_date`` stored under label *x*.

    Args:
        x: ConstituentID to look up (a string, per the original author's note).

    Returns:
        ``mean_date.loc[x]`` when *x* is in ``mean_date.index``, else ``None``.
    """
    return mean_date.loc[x] if x in mean_date.index else None

# Testing the function on a single artist (ConstituentID '27').
test_tab = df9[['ConstituentID', 'Date']]
# .copy() makes test_date an independent frame, so the assignment below does
# not trigger pandas' SettingWithCopyWarning.
test_date = test_tab[test_tab.ConstituentID == '27'].copy()

# Fill the missing dates with the value getmean returns for the row's artist.
test_date['Date'] = test_date['Date'].fillna(test_date['ConstituentID'].apply(getmean))
test_date

In [None]:
# Testing a second time, with several artists.
# .copy() makes test_date_bis an independent frame, so the assignment below
# does not trigger pandas' SettingWithCopyWarning.
test_date_bis = test_tab[test_tab.ConstituentID.isin(['27', '4930'])].copy()

# Fill the missing dates with the value getmean returns for the row's artist.
test_date_bis['Date'] = test_date_bis['Date'].fillna(
    test_date_bis['ConstituentID'].apply(getmean)
)
test_date_bis

In [None]:
# Fill every missing Date with the value getmean returns for the row's ConstituentID.
df11 = df9_bis.copy()

artist_mean_date = df11['ConstituentID'].apply(getmean)
df11['Date'] = df11['Date'].fillna(artist_mean_date)

## Create Bins

In [None]:
# Store Date as int, then display the rows dated 700.
df11['Date'] = df11['Date'].astype(int)
df11[df11['Date'] == 700]

In [None]:
# Bin edges: one wide bin from 690 to 1850, then one bin per decade up to 2020.
decade_starts = list(range(1850, 2020, 10))
cutoffs = [690, *decade_starts, 2020]

# One label per bin; decade bins are named "<start + 1>-<start + 10>".
labels = ["690-1850"] + [
    str(start + 1) + "-" + str(start + 10) for start in decade_starts
]
labels

In [None]:
# Assign every row the label of the bin its Date falls into.
df11['DateRange'] = pd.cut(df11['Date'], bins=cutoffs, labels=labels)

## Create histogram

In [None]:
## 1st way to get the dataframe: count the 'Id' values in each DateRange,
## with DateRange turned back into a regular column.
ids_per_range = df11.groupby('DateRange')['Id']
bar = ids_per_range.count().reset_index()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of the number of works per date range.
sns.set()
fig, ax = plt.subplots(figsize=(20, 8))
# Plot the `bar` frame built by the groupby above: `df99` and a 'Count' column
# are never created in this notebook, so the per-range count (held in 'Id') is
# renamed to 'Count' for the y axis.
counts_per_range = bar.rename(columns={'Id': 'Count'})
barchart = sns.barplot(data=counts_per_range, x='DateRange', y='Count', ax=ax)