In [1]:
import pandas as pd
import numpy as np

### Chi-squared always applied on nominal data

In [2]:
file = pd.read_csv("income.csv")
file

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [3]:
file["sex"].value_counts()

 Male      21790
 Female    10771
Name: sex, dtype: int64

In [4]:
file.shape[0]/2 #half of the total sample

16280.5

In [5]:
observed = [21790,10771]
expected = [16280.5,16280.5]

male = (21790 - 16280.5)/16280.5
female = (10771 - 16280.5)/16280.5

change = male + female
change

0.0

In [6]:
male = ((21790 - 16280.5)**2)/16280.5
female = ((10771 - 16280.5)**2)/16280.5

chi_sq = male + female
chi_sq #chi-squared of gender

3728.950615767329

In [7]:
gender_chi_sq = []
for i in range(1000):
    sequence = np.random.rand(32561)
    sequence[sequence < 0.5] = 0
    sequence[sequence >= 0.5] = 1
    
    male = len(sequence[sequence == 0])
    female = len(sequence[sequence == 1])
    
    male_change = ((male - 16280.5)**2)/16280.5
    female_change = ((female - 16280.5)**2)/16280.5
    
    chi_sq = male_change + female_change
    gender_chi_sq.append(chi_sq)

In [8]:
gender_chi_sq

[0.2771720770246614,
 0.24326648444458093,
 3.071158748195694e-05,
 0.2771720770246614,
 0.35161696508092505,
 5.037468136727988,
 0.3258192315960812,
 0.15481711249654495,
 0.0007677896870489236,
 0.15481711249654495,
 0.5933785817388901,
 1.216209575872977,
 3.00878351401984,
 0.037621694665397255,
 4.938453978686158,
 0.4646356070145266,
 1.7250391572740396,
 0.815976167808114,
 0.5110715272872455,
 0.1916710174748933,
 1.4461779429378705,
 1.2906544639292405,
 1.4461779429378705,
 1.8134885292220755,
 2.70903842019594,
 1.1439759221154142,
 0.146217868001597,
 0.7960750591198059,
 2.1893983600012286,
 3.071158748195694e-05,
 0.1218942907158871,
 0.2324560056509321,
 4.889315438715027,
 1.1439759221154142,
 0.8361229691962777,
 0.07988083904057,
 0.35161696508092505,
 0.06784189674764289,
 0.4646356070145266,
 0.6457111268081447,
 0.2324560056509321,
 1.2655938085439635,
 0.815976167808114,
 3.7407020668898374,
 0.06784189674764289,
 0.12975645711126807,
 0.006910107183440312,
 0.01

In [9]:
#Finding p-value

counter = 0
for i in gender_chi_sq:
    if i >= 3728.950615767329:
        counter += 1

In [10]:
counter

0

In [11]:
p_value = counter/1000
p_value

0.0

### Directly finding chi squared and p value from library  

In [12]:
from scipy.stats import chisquare

chi_sq,p_value = chisquare(observed, expected)
p_value

0.0

In [13]:
chi_sq

3728.950615767329

### Assignment  

In [14]:
file["race"].value_counts()

 White                 27816
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: race, dtype: int64

expected values:
    white: 80.3%
    
    black: 12.1%
    
    asian: 2.9%
    
    indian: 0.8%
    
    other: 3.9%

In [15]:
80.3 + 2.9 + 3.9 + 12.1 + 0.8

100.0

In [29]:
total = 32561
white = (80.3/100) * total
black = (12.1/100) * total
asian = (2.9/100) * total
indian = (0.8/100) * total
other = (3.9/100) * total

In [31]:
expected = [white,black,asian,indian,other]
expected

[26146.482999999997, 3939.881, 944.2689999999999, 260.488, 1269.879]

In [32]:
file["race"].value_counts()

 White                 27816
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: race, dtype: int64

In [33]:
observed = [27816,3124,1039,311,271]

In [35]:
chi_sq, p_value = chisquare(observed,expected)


1.2331921773233437e-232

In [36]:
print(chi_sq,p_value)

1080.568159486447 1.2331921773233437e-232


in cross tab, expected values need to be calculated

In [16]:
cross_tab = pd.crosstab(file["sex"],file["high_income"])
cross_tab # males and females having incomes greater than and lesser than 50k

high_income,<=50K,>50K
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,9592,1179
Male,15128,6662


In [17]:
observed = [9592,1179,15128,6662]

female = 10771
male = 21790

In [18]:
f_less50k = 9592/32561
f_above50k = 1179/32561
m_less50k = 15128/32561
m_above50k = 6662/32561

In [19]:
df = pd.DataFrame({"<=50K": [f_less50k, m_less50k,f_less50k+m_less50k], ">50K": [f_above50k,m_above50k,f_above50k+m_above50k],
                   "Total": [f_less50k+f_above50k, m_less50k+m_above50k,f_less50k+m_less50k+f_above50k+m_above50k]},
                  index = ["Female","Male","Total"])
df

Unnamed: 0,<=50K,>50K,Total
Female,0.294586,0.036209,0.330795
Male,0.464605,0.204601,0.669205
Total,0.75919,0.24081,1.0


In [20]:
exp_f_less50k = 0.330795*0.759190*32561
exp_f_above50k = 0.330795*0.240810*32561
exp_m_less50k = 0.669205*0.759190*32561
exp_m_above50k = 0.669205*0.240810*32561


In [21]:
expected = [exp_f_less50k,exp_f_above50k,exp_m_less50k,exp_m_above50k]
expected

[8177.24763324405, 2593.76836175595, 16542.73795675595, 5247.24604824405]

In [22]:


chi_sq,p_value = chisquare(observed, expected)
print(chi_sq,p_value)

1518.8837734359522 0.0


In [23]:
table = pd.crosstab(file["sex"], file["race"])
table

race,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,119,346,1555,109,8642
Male,192,693,1569,162,19174


In [24]:
from scipy.stats import chi2_contingency

In [25]:
chi_sq, p_value, df, expected = chi2_contingency(table) 

df is degree of freedom. 

In [26]:
expected

array([[  102.87709223,   343.69549461,  1033.40204539,    89.64531188,
         9201.3800559 ],
       [  208.12290777,   695.30450539,  2090.59795461,   181.35468812,
        18614.6199441 ]])

finding expected values from crosstab when values increase is difficult. by finding proportion and solving etc. so we import the chi2_contingency library which directly gives us the expected value along with other useful data just by inputting the table

In [27]:
p_value

5.192061302760456e-97

In [28]:
df # if we change 1 value in our table, 4 other values will be altered

4