In [1]:
%matplotlib inline

# Predicting Religion from Country Flags 

This notebook uses the UCI Machine Learning Repository _flags_ dataset to predict the religion of a country based on the attributes of their flags. 

ðŸ‡¦ðŸ‡«ðŸ‡¦ðŸ‡½ðŸ‡¦ðŸ‡±ðŸ‡©ðŸ‡¿ðŸ‡¦ðŸ‡¸ðŸ‡¦ðŸ‡©ðŸ‡¦ðŸ‡´ðŸ‡¦ðŸ‡®ðŸ‡¦ðŸ‡¶ðŸ‡¦ðŸ‡¬ðŸ‡¦ðŸ‡·ðŸ‡¦ðŸ‡²ðŸ‡¦ðŸ‡¼ðŸ‡¦ðŸ‡ºðŸ‡¦ðŸ‡¹ðŸ‡¦ðŸ‡¿ðŸ‡§ðŸ‡¸ðŸ‡§ðŸ‡­ðŸ‡§ðŸ‡©ðŸ‡§ðŸ‡§ðŸ‡§ðŸ‡¾ðŸ‡§ðŸ‡ªðŸ‡§ðŸ‡¿ðŸ‡§ðŸ‡¯ðŸ‡§ðŸ‡²ðŸ‡§ðŸ‡¹ðŸ‡§ðŸ‡´ðŸ‡§ðŸ‡¶ðŸ‡§ðŸ‡¦ðŸ‡§ðŸ‡¼ðŸ‡§ðŸ‡·ðŸ‡®ðŸ‡´

In [2]:
import os 

import numpy as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt 

In [3]:
# Load data and do some simple data management 

DATA  = "flags_missingvals.data"
FEATS = [
    "name", "landmass", "zone", "area", "population", "language", "religion", "bars", 
    "stripes", "colours", "red", "green", "blue", "gold", "white", "black", "orange", 
    "mainhue", "circles", "crosses", "saltires", "quarters", "sunstars", "crescent", 
    "triangle", "icon", "animate", "text", "topleft", "botright",
]

landmass = {
    1.0: 'N.America', 2.0: 'S.America', 3.0: 'Europe', 4.0: 'Africa', 5.0: 'Asia', 6.0: 'Oceania'
}

zone = {
    1.0: 'NE', 2.0: 'SE', 3.0: 'SW', 4.0: 'NW'
}

language = {
    1.0: 'English', 2.0: 'Spanish', 3.0: 'French', 4.0: 'German', 5.0: 'Slavic', 6.0: 'Other, Indo-European', 
    7.0: 'Chinese', 8.0: 'Arabic', 9.0: 'Japanese/Turkish/Finnish/Magyar', 10.0: 'Others'
}

religion = {
    0.0: 'Catholic', 1.0: 'Other Christian', 2.0: 'Muslim', 3.0: 'Buddhist', 4.0: 'Hindu',
    5.0: 'Ethnic', 6.0: 'Marxist', 7.0: 'Others'
}

# Load Data 
df = pd.read_csv(DATA, header=None, names=FEATS)

# Convert numeric symbols to strings 
df.landmass = df.landmass.apply(lambda k: landmass.get(k, ''))
df.zone = df.zone.apply(lambda k: zone.get(k, ''))
df.language = df.language.apply(lambda k: zone.get(k, ''))
df.religion = df.religion.apply(lambda k: religion.get(k, ''))
df.mainhue = df.mainhue.apply(str)
df.topleft = df.topleft.apply(str)
df.botright = df.botright.apply(str)
df.head()

Unnamed: 0,name,landmass,zone,area,population,language,religion,bars,stripes,colours,...,saltires,quarters,sunstars,crescent,triangle,icon,animate,text,topleft,botright
0,Afghanistan,Asia,NE,648.0,16.0,,Muslim,0.0,3,5.0,...,0.0,0,1.0,0,0.0,1.0,0,0,black,green
1,Albania,Europe,NE,29.0,3.0,,Marxist,0.0,0,3.0,...,0.0,0,1.0,0,0.0,0.0,1,0,red,red
2,Algeria,Africa,NE,2388.0,20.0,,Muslim,2.0,0,3.0,...,0.0,0,1.0,1,0.0,0.0,0,0,green,white
3,American-Samoa,Oceania,SW,0.0,0.0,NE,Other Christian,0.0,0,5.0,...,0.0,0,0.0,0,1.0,1.0,1,0,blue,red
4,Andorra,Europe,NE,0.0,0.0,,Catholic,3.0,0,3.0,...,0.0,0,0.0,0,0.0,0.0,0,0,blue,red


In [4]:
df.describe()

Unnamed: 0,area,population,bars,stripes,colours,red,green,blue,gold,white,...,circles,crosses,saltires,quarters,sunstars,crescent,triangle,icon,animate,text
count,190.0,192.0,192.0,194.0,193.0,193.0,193.0,194.0,191.0,193.0,...,193.0,193.0,193.0,194.0,192.0,194.0,193.0,193.0,194.0,194.0
mean,700.763158,22.645833,0.458333,1.551546,3.466321,0.787565,0.471503,0.510309,0.47644,0.751295,...,0.170984,0.150259,0.093264,0.149485,1.375,0.056701,0.139896,0.253886,0.201031,0.082474
std,2192.728281,92.140114,1.042718,2.328005,1.303103,0.410095,0.500486,0.501187,0.500757,0.433386,...,0.464116,0.386238,0.291559,0.43586,4.410245,0.231869,0.347782,0.436365,0.401808,0.275798
min,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,110.0,3.5,0.0,0.0,3.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,449.25,14.0,0.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
max,22402.0,1008.0,5.0,14.0,8.0,1.0,1.0,1.0,1.0,1.0,...,4.0,2.0,1.0,4.0,50.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Custom transformer for multi-column label encoding 
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin 


class EncodeCategorical(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns=[], missing_value=''):
        self.columns  = list(columns)
        self.encoders = None
        
    def fit(self, X, y=None):
        """
        Creates a label encoder for each column specified. 
        """
        if not self.columns:
            self.columns = X.columns 
        
        self.encoders = {
            column:  LabelEncoder().fit(X[column])
            for column in self.columns
        }
        
        return self
    
    def transform(self, X):
        output = X.copy()
        for col, encoder in self.encoders.items():
            output[col] = encoder.transform(output[col])
        
        return output

In [6]:
class Pandas2Matrix(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X):
        return X.as_matrix().astype('float64')

In [7]:
feature_names = [
    "landmass", "zone", "area", "population", "language", "bars", 
    "stripes", "colours", "red", "green", "blue", "gold", "white", "black", "orange", 
    "mainhue", "circles", "crosses", "saltires", "quarters", "sunstars", "crescent", 
    "triangle", "icon", "animate", "text", "topleft", "botright",
]

X = df[feature_names]
y = df.religion

In [8]:
from sklearn.preprocessing import LabelEncoder

# Target Label Encoder 
target_encoder = LabelEncoder() 
y = target_encoder.fit_transform(y)

In [9]:
from sklearn.preprocessing import Imputer 
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestClassifier


pipeline = Pipeline([
        ('feature_encoder', EncodeCategorical(['landmass', 'zone', 'language', 'mainhue', 'topleft', 'botright'])), 
        ('pd2np', Pandas2Matrix()), 
        ('imputer', Imputer(missing_values=np.nan, strategy='most_frequent')),
        ('classifier', RandomForestClassifier()),
    ])

In [11]:
from sklearn.metrics import classification_report 
from sklearn.cross_validation import train_test_split as tts 

# Create training and test splits 
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.1)

# Fit on the training data 
pipeline.fit_transform(X_train, y_train)

# Evaluate on the test data 
y_hat = pipeline.predict(X_test)


# Convert y back to strings 
y_test = target_encoder.inverse_transform(y_test)
y_hat  = target_encoder.inverse_transform(y_hat)

print(classification_report(y_hat, y_test))

             precision    recall  f1-score   support

   Buddhist       0.00      0.00      0.00         0
   Catholic       1.00      0.43      0.60         7
     Ethnic       0.75      0.75      0.75         4
    Marxist       0.50      1.00      0.67         1
     Muslim       0.60      0.43      0.50         7
Other Christian       0.33      1.00      0.50         1
     Others       0.00      0.00      0.00         0

avg / total       0.75      0.55      0.59        20



  'recall', 'true', average, warn_for)
