In [8]:
import pandas as pd
import numpy as np
import time
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn import cross_validation
from sklearn.cross_validation import KFold
from sklearn.cross_validation import LeaveOneOut

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

In [9]:
#Load data
df = pd.read_csv('/Users/kevenlemoing/Sites/sandvik_code_assignement/data/indicators.csv',error_bad_lines=False)
df.head()

Unnamed: 0,folder,record_name,mean_frequency,standard_deviation,median_frequency,first_quantile,third_quantile,inter_quantile_range,kurtosis,skewness,speaker_name,speaker_gender,speaker_age_range,speaker_language,speaker_dialect,sampling_rate,sample_rate_format
0,1028-20100710-hne,rp-29,368.604505,140.704429,368.604505,289.01006,328.807282,99.493056,-2.0,0.0,1028,Male,Adult,EN,American English,48000\n,16\n
1,1028-20100710-hne,rp-30,294.95836,119.206456,294.95836,227.525005,261.241683,84.291694,-2.0,4.859547e-16,1028,Male,Adult,EN,American English,48000\n,16\n
2,1028-20100710-hne,rp-31,337.401379,70.014478,337.401379,297.795209,317.598294,49.507712,-2.0,0.0,1028,Male,Adult,EN,American English,48000\n,16\n
3,1028-20100710-hne,ar-01,347.713169,136.622691,347.713169,270.427704,309.070436,96.606831,-2.0,0.0,1028,Male,Adult,EN,American English,48000\n,16\n
4,1028-20100710-hne,ar-02,372.315279,156.235835,372.315279,283.934944,328.125112,110.475419,-2.0,0.0,1028,Male,Adult,EN,American English,48000\n,16\n


### Predictions considering all of audio features 

In [10]:
start_time = time.time()

#Data organization
label_names = ['Male','Female']
labels = df['speaker_gender']
feature_names = ['mean_frequency','standard_deviation','median_frequency','first_quantile','third_quantile',
                 'inter_quantile_range','kurtosis','skewness']
features = df[['mean_frequency','standard_deviation','median_frequency','first_quantile','third_quantile',
            'inter_quantile_range','kurtosis','skewness']].values    

print(label_names)
print('Class label = ', labels[0])
print(feature_names)
print(features[0])

# Split our data
train, test, train_labels, test_labels = train_test_split(features,
                                                          labels,
                                                          test_size=0.33,
                                                          random_state=42)
# Initialize our classifier
rfc = RandomForestClassifier()

# Train our classifier
model = rfc.fit(train, train_labels)

# Make predictions
preds = rfc.predict(test)
print(preds)

# Evaluate accuracy
print(accuracy_score(test_labels, preds))

print("")
print("--- %s seconds ---" % (time.time() - start_time))
print("")

['Male', 'Female']
('Class label = ', 'Male')
['mean_frequency', 'standard_deviation', 'median_frequency', 'first_quantile', 'third_quantile', 'inter_quantile_range', 'kurtosis', 'skewness']
[ 368.60450482  140.70442937  368.60450482  289.0100599   328.80728236
   99.49305615   -2.            0.        ]
['Male' 'Male' 'Male' ..., 'Male' '[female];' 'Male']
0.636064257028

--- 1.8435959816 seconds ---



### Predictions just considering standard deviation and inter quantile range

In [11]:
start_time = time.time()

#Data organization
label_names = ['Male','Female']
labels = df['speaker_gender']
feature_names = ['standard_deviation','inter_quantile_range','first_quantile']
features = df[['standard_deviation','inter_quantile_range','first_quantile']].values    

print(label_names)
print('Class label = ', labels[0])
print(feature_names)
print(features[0])

# Split our data
train, test, train_labels, test_labels = train_test_split(features,
                                                          labels,
                                                          test_size=0.33,
                                                          random_state=42)
# Initialize our classifier
clf_rfc = RandomForestClassifier()

# Train our classifier
model = clf_rfc.fit(train, train_labels)

# Make predictions
preds = clf_rfc.predict(test)
print(preds)

# Evaluate accuracy
print(accuracy_score(test_labels, preds))

print("")
print("--- %s seconds ---" % (time.time() - start_time))
print("")

['Male', 'Female']
('Class label = ', 'Male')
['standard_deviation', 'inter_quantile_range', 'first_quantile']
[ 140.70442937   99.49305615  289.0100599 ]
['Male' 'Male' 'Male' ..., 'Female' 'Male' 'Male']
0.640803212851

--- 1.33476400375 seconds ---



### Predictions with clustered data (Adult, EN, American english)

In [12]:
start_time = time.time()

#Dataframe filtered for clustered data
df = df.loc[(df['speaker_age_range'] == 'Adult') & (df['speaker_language'] == 'EN') & (df['speaker_dialect'] == 'American English')]

#Data organization
label_names = ['Male','Female']
labels = df['speaker_gender']
feature_names = ['standard_deviation','inter_quantile_range','first_quantile']
features = df[['standard_deviation','inter_quantile_range','first_quantile']].values    

print(label_names)
print('Class label = ', labels[0])
print(feature_names)
print(features[0])

# Split our data
train, test, train_labels, test_labels = train_test_split(features,
                                                          labels,
                                                          test_size=0.33,
                                                          random_state=42)
# Initialize our classifier
clf_rfc = RandomForestClassifier()

# Train our classifier
model = clf_rfc.fit(train, train_labels)

# Make predictions
preds = clf_rfc.predict(test)
print(preds)

# Evaluate accuracy
print(accuracy_score(test_labels, preds))

print("")
print("--- %s seconds ---" % (time.time() - start_time))
print("")

['Male', 'Female']
('Class label = ', 'Male')
['standard_deviation', 'inter_quantile_range', 'first_quantile']
[ 140.70442937   99.49305615  289.0100599 ]
['Male' 'Male' 'Male' ..., 'Male' 'Male' 'Male']
0.903380423814

--- 0.326842069626 seconds ---

