# Загрузка данных и поиск стран, для которых в выборке есть как минимум 30 респондентов.

In [1]:
!pip install kaggle

In [None]:
# нужно загрузить kaggle.json
from google.colab import files
files.upload()
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

# скачивание датасета
!kaggle datasets download -d rtatman/speech-accent-archive

# разархивация
!unzip 'speech-accent-archive.zip'

In [4]:
import pandas as pd
import numpy as np
# Load the speaker metadata table; the first CSV row is used as the column header.
df = pd.read_csv("speakers_all.csv", header=0)

In [5]:
# Number of distinct speakers per country, sorted ascending.
# (The name `languages` is kept as-is, although the grouping key is the country.)
languages = df.groupby('country')['speakerid'].nunique().sort_values()

# Countries with at least 30 speakers, as [country, count] rows.
# The dict is built once instead of being rebuilt for every element.
# Mixing str and int in one array makes numpy store the counts as strings.
speaker_counts = languages.to_dict()
arr = np.array([[country, n] for country, n in speaker_counts.items() if n >= 30])
for row in arr:
  print(row)

['ethiopia' '31']
['russia' '31']
['italy' '32']
['germany' '32']
['australia' '33']
['saudi arabia' '33']
['poland' '34']
['turkey' '35']
['belgium' '36']
['brazil' '39']
['south korea' '51']
['canada' '54']
['india' '59']
['uk' '67']
['china' '88']
['usa' '393']


In [9]:
# Boolean mask: rows whose country is one of the selected countries
# (first column of `arr`).
df_filter = df['country'].isin(arr[:,0])

# New table with only the rows we need. The explicit copy makes df_new an
# independent frame, so later in-place edits do not refer back to df.
df_new = df[df_filter].copy()

# Per the original note, the audio files for these three rows are missing,
# so they are removed by index label.
# NOTE(review): the `file_missing?` column may carry the same information;
# confirm before replacing these hard-coded labels with a mask on it.
df_new = df_new.drop(labels=[9, 10, 20], axis=0)
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1045 entries, 50 to 2170
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1045 non-null   float64
 1   age_onset        1045 non-null   float64
 2   birthplace       1045 non-null   object 
 3   filename         1045 non-null   object 
 4   native_language  1045 non-null   object 
 5   sex              1045 non-null   object 
 6   speakerid        1045 non-null   int64  
 7   country          1045 non-null   object 
 8   file_missing?    1045 non-null   bool   
 9   Unnamed: 9       0 non-null      float64
 10  Unnamed: 10      0 non-null      float64
 11  Unnamed: 11      1 non-null      object 
dtypes: bool(1), float64(4), int64(1), object(6)
memory usage: 99.0+ KB


# Получение суммы zero-crossing по каждой из записей.

In [None]:
!pip install librosa
import librosa

In [None]:
# Reuse the empty 'Unnamed: 9' column to hold the zero-crossing count of each recording.
df_new.rename(columns = {'Unnamed: 9' : 'zero_crossing'}, inplace = True)

# Iterate over (row label, filename) pairs so no manual position counter is needed.
for row_label, name in df_new['filename'].items():
  x, sr = librosa.load('recordings/recordings/' + name + '.mp3')

  zero_crossings = librosa.zero_crossings(x, pad=False)
  # Write into 'zero_crossing' (underscore), the column renamed above and the
  # one the ANOVA formula refers to. Writing to 'zero-crossing' would create a
  # second column and leave this one empty.
  df_new.at[row_label, 'zero_crossing'] = zero_crossings.sum()

In [14]:
# Show five randomly chosen rows as a quick sanity check (no seed is set, so rows differ between runs).
df_new.sample(5)

Unnamed: 0,age,age_onset,birthplace,filename,native_language,sex,speakerid,country,file_missing?,zero-crossing,Unnamed: 10,Unnamed: 11
315,39.0,12.0,"essen, belgium",dutch10,dutch,male,913,belgium,False,74358.0,,
770,18.0,0.0,"portland, maine, usa",english466,english,male,1637,usa,False,108171.0,,
1627,26.0,24.0,"juiz de fora, brazil",portuguese30,portuguese,female,1285,brazil,False,39839.0,,
504,38.0,0.0,"walton-on-thames, surrey, uk",english226,english,male,772,uk,False,62212.0,,
590,45.0,0.0,"anaheim, california, usa",english303,english,female,1086,usa,False,85737.0,,


# ANOVA-анализ по атрибутам родного языка, пола и возраста при уровне значимости 0.15.

In [19]:
from statsmodels.formula.api import ols
import statsmodels.api as sm

In [23]:
# Linear model of the zero-crossing count on native language, sex and age,
# including all of their interactions; then a type-1 ANOVA table for it.
anova_formula = 'zero_crossing ~ C(native_language) * C(sex) * age'
lm = ols(anova_formula, data=df_new).fit()
table = sm.stats.anova_lm(lm, typ=1)
table

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(native_language),55.0,262660300000.0,4775643000.0,6.723694,6.596541e-39
C(sex),1.0,11872530000.0,11872530000.0,16.715496,4.724826e-05
C(native_language):C(sex),55.0,88797660000.0,1614503000.0,2.273081,8.381733e-07
age,1.0,12050390000.0,12050390000.0,16.965907,4.150735e-05
C(native_language):age,55.0,66031140000.0,1200566000.0,1.690294,0.001601797
C(sex):age,1.0,122327200.0,122327200.0,0.172226,0.6782383
C(native_language):C(sex):age,55.0,37932140000.0,689675300.0,0.971004,0.5362277
Residual,914.0,649187400000.0,710270600.0,,


In [24]:
# Keep only the model terms whose p-value is below the 0.15 significance level.
is_significant = table['PR(>F)'] < 0.15
table.loc[is_significant]

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(native_language),55.0,262660300000.0,4775643000.0,6.723694,6.596541e-39
C(sex),1.0,11872530000.0,11872530000.0,16.715496,4.724826e-05
C(native_language):C(sex),55.0,88797660000.0,1614503000.0,2.273081,8.381733e-07
age,1.0,12050390000.0,12050390000.0,16.965907,4.150735e-05
C(native_language):age,55.0,66031140000.0,1200566000.0,1.690294,0.001601797
