### Data: https://archive.ics.uci.edu/ml/datasets/Flags

In [1]:
import pandas as pd
import sqlalchemy as db

In [2]:
# Open a connection to the local SQLite file that stores the flags data.
engine = db.create_engine('sqlite:///flags_data.db')
connection = engine.connect()

# Build the `flags` Table object by reflecting its definition from the database.
metadata = db.MetaData()
flags = db.Table(
    'flags',
    metadata,
    autoload=True,
    autoload_with=engine,
)

In [3]:
# Fetch every row of the flags table, then preview the first five as a DataFrame.
query = db.select([flags])
results = connection.execute(query).fetchall()
column_names = results[0].keys()
flags_preview = pd.DataFrame(results, columns=column_names)
flags_preview.head(5)

Unnamed: 0,name,landmass,zone,area,population,language,religion,bars,stripes,colors,...,saltires,quarters,sunstars,crescent,triangle,icon,animate,text,topleft,botright
0,Afghanistan,5,1,648,16,10,2,0,3,5,...,0,0,1,0,0,1,0,0,black,green
1,Albania,3,1,29,3,6,6,0,0,3,...,0,0,1,0,0,0,1,0,red,red
2,Algeria,4,1,2388,20,8,2,2,0,3,...,0,0,1,1,0,0,0,0,green,white
3,American-Samoa,6,3,0,0,1,1,0,0,5,...,0,0,0,0,1,1,1,0,blue,red
4,Andorra,3,1,0,0,6,0,3,0,3,...,0,0,0,0,0,0,0,0,blue,red


In [4]:
# `results` still holds every row fetched from the flags table in the cell above,
# so this is the number of rows in that table.
len(results)

194

In [5]:
# writing a small function which filters a boolean column, sorts countries in ascending order
def boolean_filter_db(filter_column, country_column):
    """Return the countries whose ``filter_column`` equals 1, sorted ascending.

    Parameters
    ----------
    filter_column : SQLAlchemy column expression
        Column compared against 1 (a 0/1 indicator, or a coded column
        where the code 1 is the value of interest).
    country_column : SQLAlchemy column expression
        Column that is selected and used for the ascending sort.

    Returns
    -------
    pandas.DataFrame
        A single column (``country_column``) with one row per match, or an
        empty frame with that column when nothing matches.
    """
    filter_rows = (
        db.select([country_column])
        .where(filter_column == 1)
        .order_by(db.asc(country_column))
    )
    result = connection.execute(filter_rows)
    # Read the column names from the result object rather than from the first
    # row, so a query with no matches does not raise IndexError.
    column_names = list(result.keys())
    return pd.DataFrame(result.fetchall(), columns=column_names)

In [6]:
# get all the countries with language English, sorted by countries in ascending order
# (the helper keeps rows where the language column equals 1, which this
# analysis treats as the code for English)
english_language = boolean_filter_db(flags.columns.language, flags.columns.name)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print("There are {} countries with language english".format(len(english_language)))

There are 43 countries with lanuguage english


In [7]:
# use the same functionality for different colors which are red, green, blue, gold, white, black, orange
colors_list = ['red', 'green', 'blue', 'gold', 'white', 'black', 'orange']
# Look each column up by its name so the labels and the columns cannot drift apart.
colors_column_names = [flags.columns[color_name] for color_name in colors_list]
note = "There are {} countries with {} color in flags"
for color_name, clr in zip(colors_list, colors_column_names):
    color_pick = boolean_filter_db(clr, flags.columns.name)
    # print(...) with a single argument behaves the same on Python 2 and Python 3.
    print(note.format(len(color_pick), color_name))

There are 153 countries with red color in flags
There are 91 countries with green color in flags
There are 99 countries with blue color in flags
There are 91 countries with gold color in flags
There are 146 countries with white color in flags
There are 52 countries with black color in flags
There are 26 countries with orange color in flags


In [8]:
# use the same functionality for different shapes, text which are crescent, triangle, icon, animate, text
shapes_list = ['crescent', 'triangle', 'icon', 'animate', 'text']
# Look each column up by its name so the labels and the columns cannot drift apart.
shapes_column_names = [flags.columns[shape_name] for shape_name in shapes_list]
note = "There are {} countries with {} in flags"
for shape_name, shp in zip(shapes_list, shapes_column_names):
    shape_pick = boolean_filter_db(shp, flags.columns.name)
    # print(...) with a single argument behaves the same on Python 2 and Python 3.
    print(note.format(len(shape_pick), shape_name))

There are 11 countries with crescent in flags
There are 27 countries with triangle in flags
There are 49 countries with icon in flags
There are 39 countries with animate in flags
There are 16 countries with text in flags


In [9]:
# writing a small function which takes in a column, sorts it in descending order excluding zeros
# and returns the country along with it
def custom_filter_db(filter_column, country_column):
    """Return (country, value) rows where ``filter_column`` > 0, largest first.

    Parameters
    ----------
    filter_column : SQLAlchemy column expression
        Numeric column; rows with a value of 0 or less are excluded and the
        remaining rows are sorted by this column in descending order.
    country_column : SQLAlchemy column expression
        Column returned alongside ``filter_column`` to identify each row.

    Returns
    -------
    pandas.DataFrame
        Two columns (``country_column`` then ``filter_column``); an empty
        frame with those columns when no row has a positive value.

    Notes
    -----
    The query sorts on ``filter_column`` only, so the relative order of rows
    that share the same value is whatever the database returns.
    """
    filter_query = (
        db.select([country_column, filter_column])
        .where(filter_column > 0)
        .order_by(db.desc(filter_column))
    )
    result = connection.execute(filter_query)
    # Read the column names from the result object rather than from the first
    # row, so a query with no matches does not raise IndexError.
    column_names = list(result.keys())
    return pd.DataFrame(result.fetchall(), columns=column_names)

In [10]:
# get all the countries flags with most no. of vertical bars, sorted by vertical bars
flags_with_bars = custom_filter_db(flags.columns.bars, flags.columns.name)
note = "There are {} countries with bars in flags, among which {} has the most number of bars: {}"
# The helper sorts by bar count descending, so the first row holds the maximum.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(note.format(
    len(flags_with_bars),
    flags_with_bars['name'].iloc[0],
    flags_with_bars['bars'].iloc[0],
))

There are 35 countries with bars in flags, among which St-Vincent has the most number of bars: 5


In [11]:
# get all the countries flags with most no. of horizontal stripes, sorted by horizontal stripes
flags_with_stripes = custom_filter_db(flags.columns.stripes, flags.columns.name)
note = "There are {} countries with stripes in flags, among which {} has the most number of stripes: {}"
# The helper sorts by stripe count descending, so the first row holds the maximum.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(note.format(
    len(flags_with_stripes),
    flags_with_stripes['name'].iloc[0],
    flags_with_stripes['stripes'].iloc[0],
))

There are 84 countries with stripes in flags, among which Malaysia has the most number of stripes: 14


In [12]:
# get all the countries flags with most no. of different colors in the flag, sorted by different colors in the flag
different_colors = custom_filter_db(flags.columns.colors, flags.columns.name)
note = "There are {} countries with different colors in flags, among which {} has the most number of different colors: {}"
# The helper sorts by color count descending, so the first row holds the maximum.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(note.format(
    len(different_colors),
    different_colors['name'].iloc[0],
    different_colors['colors'].iloc[0],
))

There are 194 countries with different colors in flags, among which Belize has the most number of different colors: 8


In [13]:
# report, for each symbol column, how many flags contain it and which flag has the most
symbols = ['circles', 'crosses', 'saltires', 'quarters', 'sunstars']
# Look each column up by its name so the labels and the columns cannot drift apart.
symbol_column_names = [flags.columns[symbol_name] for symbol_name in symbols]
note = "There are {} countries with {} in flags, among which {} has the most number of {}: {}"
for symbol_name, symbol in zip(symbols, symbol_column_names):
    flags_with_symbols = custom_filter_db(symbol, flags.columns.name)
    # print(...) with a single argument behaves the same on Python 2 and Python 3.
    if symbol_name == 'saltires':
        # saltires only gets the count, without the "has the most" clause
        print("There are {} countries with saltires in flags".format(len(flags_with_symbols)))
    else:
        # Rows are sorted descending by the symbol count, so row 0 is the maximum.
        print(note.format(
            len(flags_with_symbols),
            symbol_name,
            flags_with_symbols['name'].iloc[0],
            symbol_name,
            flags_with_symbols[symbol_name].iloc[0],
        ))


There are 29 countries with circles in flags, among which Bhutan has the most number of circles: 4
There are 27 countries with crosses in flags, among which Fiji has the most number of crosses: 2
There are 18 countries with saltires in flags
There are 26 countries with quarters in flags, among which Panama has the most number of quarters: 4
There are 80 countries with sunstars in flags, among which USA has the most number of sunstars: 50


In [14]:
# group by landmass, sum population, largest total first
landmass_mapping = {1: 'N.America', 2: 'S.America', 3: 'Europe', 4: 'Africa', 5: 'Asia', 6: 'Oceania'}
population_total = db.func.sum(flags.columns.population).label('population')
group_landmass = (
    db.select([population_total, flags.columns.landmass])
    .group_by(flags.columns.landmass)
    .order_by(db.desc('population'))
)
results = connection.execute(group_landmass).fetchall()
df = pd.DataFrame(results, columns=results[0].keys())
# Swap the numeric landmass codes for readable names before displaying.
df['landmass'] = [landmass_mapping[code] for code in df['landmass']]
df[['landmass', 'population']]

Unnamed: 0,landmass,population
0,Asia,2698
1,Europe,485
2,Africa,457
3,N.America,381
4,S.America,267
5,Oceania,226


In [15]:
# count the flags for each `topleft` value, most common first
topleft_count = db.func.count(flags.columns.name).label('count')
group_topleft = (
    db.select([topleft_count, flags.columns.topleft])
    .group_by(flags.columns.topleft)
    .order_by(db.desc('count'))
)
results = connection.execute(group_topleft).fetchall()
df = pd.DataFrame(results, columns=results[0].keys())
df[['topleft', 'count']]

Unnamed: 0,topleft,count
0,red,56
1,blue,43
2,white,41
3,green,32
4,black,12
5,gold,6
6,orange,4


In [16]:
# count the flags for each `botright` value, most common first
botright_count = db.func.count(flags.columns.name).label('count')
group_botright = (
    db.select([botright_count, flags.columns.botright])
    .group_by(flags.columns.botright)
    .order_by(db.desc('count'))
)
results = connection.execute(group_botright).fetchall()
df = pd.DataFrame(results, columns=results[0].keys())
df[['botright', 'count']]

Unnamed: 0,botright,count
0,red,69
1,blue,47
2,green,40
3,white,17
4,black,9
5,gold,9
6,brown,2
7,orange,1


In [17]:
# select only asia and africa (landmass codes 5 and 4 in landmass_mapping)
in_africa_or_asia = flags.columns.landmass.in_([4, 5])
query = db.select([flags.columns.name]).where(in_africa_or_asia)
results = connection.execute(query).fetchall()
asian_african_countries = pd.DataFrame(results, columns=results[0].keys())
asian_african_countries['name'].head(5)

0    Afghanistan
1        Algeria
2         Angola
3        Bahrain
4     Bangladesh
Name: name, dtype: object

In [18]:
# select only asian countries with green color in flags
has_green = flags.columns.green == 1
in_asia = flags.columns.landmass == 5  # 5 is the landmass code mapped to 'Asia'
filter_rows = (
    db.select([flags.columns.name, flags.columns.green])
    .where(db.and_(has_green, in_asia))
    .order_by(db.asc(flags.columns.name))
)
results = connection.execute(filter_rows).fetchall()
pd.DataFrame(results, columns=results[0].keys())

Unnamed: 0,name,green
0,Afghanistan,1
1,Bangladesh,1
2,Hong-Kong,1
3,India,1
4,Iran,1
5,Iraq,1
6,Jordan,1
7,Kuwait,1
8,Lebanon,1
9,Maldive-Islands,1
