In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy import stats

In [2]:
brain = pd.read_csv('brain.tsv', sep='\t')
carprefs = pd.read_csv('carprefs.tsv', sep='\t')

**Task 1: Brain correlations**

In [3]:
brain.head()

Unnamed: 0,Gender,FSIQ,VIQ,PIQ,Weight,Height,MRI_Count
0,Female,133,132,124,118,64.5,816932
1,Male,140,150,124,�,72.5,1001121
2,Male,139,123,150,143,73.3,1038437
3,Male,133,129,128,172,68.8,965353
4,Female,137,132,134,147,65.0,951545


In [4]:
def t(r, n):
    return r * (n - 2) ** 0.5 / (1 - r ** 2) ** 0.5


def correlation_pearson(x, y, alpha):
    corr = stats.pearsonr(x, y)[0]
    n = len(x)
    p_value = stats.t.cdf(t(corr, n), df=n - 2)
    kind = 1 if p_value > 1 - alpha else -1 if p_value < alpha else 0
    return kind, p_value


def print_correlations(df):
    for iq in ['FSIQ', 'VIQ', 'PIQ']:
        corr, p_value = correlation_pearson(df[iq], df.MRI_Count, alpha=0.05)
        kind = 'positive' if corr == 1 else 'negative'
        print(str(iq).ljust(5), ' : ', 'no correlation (p_value={})'.format(p_value) if corr == 0 else '{} correlation with p_value={}'.format(kind, p_value))

Since we are dealing with the continious random values, we will use the Pearson correlation coefficient:

In [5]:
print_correlations(brain)

FSIQ   :  positive correlation with p_value=0.9882655436865492
VIQ    :  positive correlation with p_value=0.9833991186535366
PIQ    :  positive correlation with p_value=0.9931626819819634


In the general population we can assume that we have a positive correlation between all of the kinds of IQ values and the brain size.

In [6]:
print_correlations(brain[brain.Gender == 'Male'])

FSIQ   :  positive correlation with p_value=0.9873402993251286
VIQ    :  positive correlation with p_value=0.9648796808617768
PIQ    :  positive correlation with p_value=0.9955250625441796


Same situation in the male part of the general population.

In [7]:
print_correlations(brain[brain.Gender == 'Female'])

FSIQ   :  no correlation (p_value=0.9194421119054619)
VIQ    :  no correlation (p_value=0.8609801836049074)
PIQ    :  positive correlation with p_value=0.9581096403159112


But in the female population we have a correlation only in case of the PIQ value, which actually has a pretty low p-value, so we can assume that in case of the females we have no correlation between the IQ and the brain size.

**Task 2: Car preferences**

In [8]:
carprefs.head()

Unnamed: 0,ID,Age,Sex,LicYr,LicMth,ActCar,Kids5,Kids6,PreferCar,Car15K,...,Reliable,Perform,Fuel,Safety,AC/PS,Park,Room,Doors,Prestige,Colour
0,110,18,2,0,2,3,2,2,2,2,...,4,3,3,3,3,3,3,3,3,2
1,111,25,1,8,0,1,2,2,1,1,...,4,4,4,4,3,4,4,2,3,3
2,112,63,2,46,0,3,2,2,3,2,...,3,3,3,3,3,3,3,3,3,3
3,113,51,1,35,0,3,2,2,2,2,...,3,3,3,4,3,3,3,3,1,3
4,114,19,1,2,0,2,2,2,2,1,...,3,4,3,3,2,3,3,2,3,3


In [9]:
def chi_squared(table, alpha):
    n = table.sum(axis=0)
    m = table.sum(axis=1)
    nall, r, s = n.sum(), len(m), len(n)
    chi_value = 0
    for i in range(1, r):
        for j in range(1, s):
            chi_value += (table.iloc[i][j] - m[i] * n[j] / nall) ** 2 / (m[i] * n[j] / nall)
    p_value = stats.chi.cdf(chi_value, df=(r - 1) * (s - 1))
    corr = 1 if p_value > 1 - alpha else -1 if p_value < alpha else 0
    return corr, p_value

Since we are dealing with a categorical random values here, we will use Chi-squared test to find out if there's a correlation between the car size prefernces and gender.

In [10]:
table = pd.pivot_table(carprefs[['Sex', 'PreferCar']], index=['Sex'], columns=['PreferCar'], aggfunc=np.size)

In [11]:
table

PreferCar,1,2,3,4
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,86,150,52,12
2,50,122,94,34


In [12]:
chi_squared(table, 0.05)

(1, 1.0)

With a p_value = 1 we can definetely assume that there is a strong correlation between gender and car size preferences - males can be assumed to prefer bigger cars.