# Preliminary data exploration

In this notebook I remove and add columns in order to prepare the data for the general linear model.

Import the dataset, which already includes all metadata and earlier preprocessing.

In [2]:
import pandas as pd

# Load KVO_META.csv, decoding it as Latin-1. index_col=False tells pandas not
# to use the first column of the file as the row index.
df = pd.read_csv(
    "../Materials/KVO_META.csv",
    index_col=False,
    encoding='latin-1',
)


In [3]:
# Remove leading and trailing whitespace from every text cell.
#
# Series.str.strip() replaces any non-string value in a mixed object column
# with NaN, which would silently discard data. The strip is therefore applied
# cell by cell and only to real strings; every other value (numbers, booleans,
# NaN) is passed through unchanged.
for column in df.columns:
    # Only text-like columns expose the `.str` accessor; on any other column
    # the attribute lookup raises AttributeError, so hasattr() is False and
    # the column is skipped (same column selection as a try/except around
    # `.str`).
    if not hasattr(df[column], "str"):
        continue
    df[column] = df[column].map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )

In [39]:
# Keep only the columns that are relevant for the model.
# The explicit .copy() makes the selection an independent DataFrame, so the
# columns assigned to `df` in later cells are not written to a slice of the
# originally loaded frame (this is what triggers SettingWithCopyWarning).
df = df[[
    'Date',
    'Original premiere',
    'Directors',
    'Normalised title',
    'Composer',
    'Occasion',
    'Original language',
    'Multiple bill',
    "Season",
]].copy()
# Show five randomly chosen rows as a quick sanity check.
df.sample(5)

Unnamed: 0,Date,Original premiere,Directors,Normalised title,Composer,Occasion,Original language,Multiple bill,Multiple bill.1,Season
15,1893-11-23,1821,Edward Keurvels & Henry Fontaine,Preciosa,Carl Maria von Weber,,DUI,False,False,1893-1894
3221,01/01/1928,1905,Fé Derickx & Bernard Tokkie,SalomÃ©,Richard Strauss,,DUI,True,True,1927-1928
1235,13/01/1909,1874,Jef Judels & Bernard Tokkie,De Walkure,Richard Wagner,feestavond ten voordele van den nationale belg...,DUI,False,False,1908-1909
1498,31/01/1911,1894,Henry Fontaine,Cleopatra,August Enna,,DEE,False,False,1910-1911
38,1894-02-23,?,Edward Keurvels & Henry Fontaine,Mellusina,Emiel Wambach,eerste vertoning,NL,False,False,1893-1894


In [40]:
# Derive a one-letter occasion code ('OCC') for every performance:
#   G - gala performances    (occasion text contains 'gala')
#   P - popular performances (occasion text contains 'volk')
#   N - all others
def _occasion_code(text):
    """Return 'G', 'P' or 'N' for one occasion description string."""
    if 'gala' in text:
        return 'G'
    if 'volk' in text.lower():
        return 'P'
    return 'N'


# Lower-case the descriptions first so the keyword checks match any casing.
df['Occasion'] = df['Occasion'].str.lower()
# astype(str) guarantees the classifier always receives a string.
occasion_text = df['Occasion'].astype(str)
df['OCC'] = occasion_text.apply(_occasion_code)
df['OCC'].value_counts()

N    3940
G     461
P      40
Name: OCC, dtype: int64

In [42]:
# Label every performance by its original language, using the codes stored in
# 'Original language': 'DUI' -> German, 'NL' -> Dutch, anything else -> other.
# The codes are matched case-sensitively against the column as it is, so no
# lower-casing step is needed (a lower-cased copy could never match 'DUI'/'NL').
df['Language'] = df['Original language'].astype(str).apply(
    lambda x: 'German' if 'DUI' in x else 'Dutch' if 'NL' in x else 'other'
)
df['Language'].value_counts()

German    1950
other     1586
Dutch      905
Name: Language, dtype: int64

In [44]:
# Create boolean indicator columns from the language code:
#   'Dutch'  is True when the code contains 'NL'
#   'German' is True when the code contains 'DUI'
# Real booleans are stored rather than the strings 'True'/'False': both strings
# are truthy, so they cannot be used safely as masks or in boolean tests.
language_codes = df['Original language'].astype(str)
df['Dutch'] = language_codes.apply(lambda code: 'NL' in code)
df['German'] = language_codes.apply(lambda code: 'DUI' in code)
print(df['Dutch'].value_counts())
print(df['German'].value_counts())

False    3536
True      905
Name: Dutch, dtype: int64
False    2491
True     1950
Name: German, dtype: int64


In [45]:
# Drop the two source columns ('Occasion', 'Original language') and let the
# coded 'OCC' column take over the 'Occasion' name.
# The calls are chained and the result is rebound to `df` instead of mutating
# the frame with inplace=True; the resulting columns and values are the same.
df = (
    df
    .drop(columns=['Occasion', 'Original language'])
    .rename(columns={"OCC": "Occasion"})
)
df.head()

Unnamed: 0,Date,Original premiere,Directors,Normalised title,Composer,Multiple bill,Multiple bill.1,Season,Occasion,Language,Dutch,German
0,1893-10-03,1821,Edward Keurvels & Henry Fontaine,De Vrijschutter,Carl Maria von Weber,False,False,1893-1894,N,German,False,True
1,1893-10-05,1971,Edward Keurvels & Henry Fontaine,Willem Tell,Carl Reinecke,False,False,1893-1894,N,German,False,True
2,1893-10-10,1821,Edward Keurvels & Henry Fontaine,De Vrijschutter,Carl Maria von Weber,False,False,1893-1894,N,German,False,True
3,1893-10-12,1971,Edward Keurvels & Henry Fontaine,Willem Tell,Carl Reinecke,False,False,1893-1894,N,German,False,True
4,1893-11-16,?,Edward Keurvels & Henry Fontaine,Charlotte Corday,Peter Benoit,False,False,1893-1894,N,Dutch,True,False


In [None]:
# The export is currently disabled. Uncomment the line below to write the
# prepared frame to 'Binomial_prep.csv' ('utf-8-sig' is UTF-8 with a BOM).
#df.to_csv('Binomial_prep.csv', encoding = 'utf-8-sig')