### Könyvtárak importálása

In [7]:
import pandas as pd
import numpy as np
import researchpy as rp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import os
# Directory that holds the input datasets. os.chdir() returns None, so the
# path is stored first; otherwise `ldir` would always be None.
# NOTE(review): this is a machine-specific absolute path; the notebook only
# runs as-is where this directory exists.
ldir = r'C:\Users\Gergely\Documents\Datasets'
# Relative file names used by later cells are resolved against this directory.
os.chdir(ldir)

### Dataset beolvasása

In [2]:
# Read the cars dataset; the relative name is resolved against the current
# working directory.
df = pd.read_csv(filepath_or_buffer='Cars.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Acceleration,Cylinders,Displacement,Horsepower,Miles_per_Gallon,Name,Origin,Weight_in_lbs,Year
0,12.0,8,307.0,130.0,18.0,chevrolet chevelle malibu,USA,3504,1970-01-01
1,11.5,8,350.0,165.0,15.0,buick skylark 320,USA,3693,1970-01-01
2,11.0,8,318.0,150.0,18.0,plymouth satellite,USA,3436,1970-01-01
3,12.0,8,304.0,150.0,16.0,amc rebel sst,USA,3433,1970-01-01
4,10.5,8,302.0,140.0,17.0,ford torino,USA,3449,1970-01-01


### Folytonos változóból kategóriák készítése (binnelés)

In [3]:
# Bin the continuous Horsepower column into three ordered categories.
# The intervals are right-closed: (0, 80] -> 'Gyenge', (80, 150] -> 'Közepes',
# (150, 250] -> 'Erős'. Missing or out-of-range values become NaN.
# pd.cut on a Series already returns a Series, so no extra wrapping is needed.
performance = pd.cut(
    df['Horsepower'],
    bins=[0, 80, 150, 250],
    labels=['Gyenge', 'Közepes', 'Erős'],
)

# assign() adds the column on the first run and overwrites it on later runs,
# so re-executing this cell cannot create duplicate 'Performance' columns
# (which pd.concat(..., axis=1) would do).
df = df.assign(Performance=performance)
df.head()

Unnamed: 0,Acceleration,Cylinders,Displacement,Horsepower,Miles_per_Gallon,Name,Origin,Weight_in_lbs,Year,Performance
0,12.0,8,307.0,130.0,18.0,chevrolet chevelle malibu,USA,3504,1970-01-01,Közepes
1,11.5,8,350.0,165.0,15.0,buick skylark 320,USA,3693,1970-01-01,Erős
2,11.0,8,318.0,150.0,18.0,plymouth satellite,USA,3436,1970-01-01,Közepes
3,12.0,8,304.0,150.0,16.0,amc rebel sst,USA,3433,1970-01-01,Közepes
4,10.5,8,302.0,140.0,17.0,ford torino,USA,3449,1970-01-01,Közepes


### Crosstab, darabszám (pandas - pd.crosstab)

In [5]:
# Contingency table of raw counts: one row per Origin, one column per
# Performance category.
crosstab = pd.crosstab(index=df['Origin'], columns=df['Performance'])
crosstab

Performance,Gyenge,Közepes,Erős
Origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Europe,41,30,0
Japan,46,33,0
USA,33,168,49


### Khi-négyzet ($\chi^2$), p-érték, szabadságfok, várt gyakoriságok

In [12]:
# Pearson chi-square test of independence on the Origin x Performance counts.
# The displayed result is, in order: chi-square statistic, p-value,
# degrees of freedom, and the array of expected frequencies.
stats.chi2_contingency(crosstab)

(102.43347949607889,
 2.983062129411953e-21,
 4,
 array([[ 21.3   ,  41.0025,   8.6975],
        [ 23.7   ,  45.6225,   9.6775],
        [ 75.    , 144.375 ,  30.625 ]]))

### Crosstab, százalék (researchpy - rp.crosstab)

In [17]:
# Same Origin x Performance table via researchpy, expressed as column
# percentages (each Performance column sums to 100), together with a
# chi-square test summary returned as `results`.
table, results = rp.crosstab(
    df['Origin'],
    df['Performance'],
    prop='col',
    test='chi-square',
)
table

Unnamed: 0_level_0,Performance,Performance,Performance,Performance
Unnamed: 0_level_1,Gyenge,Közepes,Erős,All
Origin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Europe,34.17,12.99,0.0,17.75
Japan,38.33,14.29,0.0,19.75
USA,27.5,72.73,100.0,62.5
All,100.0,100.0,100.0,100.0


### Khi-négyzet ($\chi^2$), p-érték, Cramér-féle V

In [18]:
# Test summary returned by rp.crosstab: Pearson chi-square, p-value and
# Cramer's V.
# NOTE(review): the p-value shows as 0.0, presumably because the table is
# rounded; scipy reports about 2.98e-21 for the same statistic.
results

Unnamed: 0,Chi-square test,results
0,Pearson Chi-square ( 4.0) =,102.4335
1,p-value =,0.0
2,Cramer's V =,0.3578


### Összefüggés szétbontása

In [23]:
# One indicator column per Origin value, so each origin can be compared
# against all the others separately.
dummies = pd.get_dummies(data=df['Origin'])
dummies.head(n=5)

Unnamed: 0,Europe,Japan,USA
0,0,0,1
1,0,0,1
2,0,0,1
3,0,0,1
4,0,0,1


In [24]:
# Break the overall association down: for every origin, test
# "this origin vs. all others" against the Performance categories.
# NOTE(review): several tests are run on the same data; consider a
# multiple-comparison correction when interpreting the p-values.
nl = "\n"  # loop-invariant; also usable inside f-string expressions

for origin in dummies.columns:
    # A separate name is used so the full Origin x Performance `crosstab`
    # built in an earlier cell is not overwritten by the last 2x3 table.
    origin_crosstab = pd.crosstab(dummies[origin], df['Performance'])
    print(origin_crosstab, nl)

    # `expected` is not printed here; it is kept so the name stays available.
    chi2, p, dof, expected = stats.chi2_contingency(origin_crosstab)
    print(f"Chi2 value= {chi2}{nl}p-value= {p}{nl}Degrees of freedom= {dof}{nl}")

Performance  Gyenge  Közepes  Erős
Europe                            
0                79      201    49
1                41       30     0 

Chi2 value= 36.316193189347004
p-value= 1.3002858816841532e-08
Degrees of freedom= 2

Performance  Gyenge  Közepes  Erős
Japan                             
0                74      198    49
1                46       33     0 

Chi2 value= 42.55763427655064
p-value= 5.737558616197178e-10
Degrees of freedom= 2

Performance  Gyenge  Közepes  Erős
USA                               
0                87       63     0
1                33      168    49 

Chi2 value= 102.42909090909092
p-value= 5.725395810854483e-23
Degrees of freedom= 2

