In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("/kaggle/input/peterson-barney/verified_pb.data", header=None)

# Save as .csv to a different directory
data.to_csv("/kaggle/working/verified_pb.csv", index=False)

In [4]:
file_path = '/kaggle/input/peterson-barney/verified_pb.data'

This code snippet loads data from a .data file into a DataFrame using pandas' `read_csv` function. The file is assumed to be tab-separated, but the delimiter can be changed by modifying the `delimiter` variable accordingly.

```python
delimiter = '\t'  # Change this according to your file's delimiter

# Load the data into a DataFrame
df = pd.read_csv(file_path, delimiter=delimiter)


In [5]:
delimiter = '\t'  # Change this according to your file's delimiter

# Load the data into a DataFrame
df = pd.read_csv(file_path, delimiter=delimiter)

This code snippet loads data from a .data file into a DataFrame using pandas' `read_csv` function. The file is assumed to be tab-separated, and column names are provided in the `columns` list.



In [6]:
columns = ['gender', 'spkid','vowel_id','vowel', 'F0', 'F1', 'F2', 'F3']

# Load the data into a DataFrame
df = pd.read_csv(file_path, delimiter=delimiter, names=columns)

In [7]:
df.head()

Unnamed: 0,gender,spkid,vowel_id,vowel,F0,F1,F2,F3
0,1,1,1,IY,160.0,240.0,2280.0,2850.0
1,1,1,1,IY,186.0,280.0,2400.0,2790.0
2,1,1,2,IH,203.0,390.0,2030.0,2640.0
3,1,1,2,IH,192.0,310.0,1980.0,2550.0
4,1,1,3,EH,161.0,490.0,1870.0,2420.0


In [8]:
gender_map = {1: 'M', 2: 'W', 3: 'C'}

# Replace numerical values in the 'gender' column with labels
df['gender'] = df['gender'].map(gender_map)

In [9]:
df.head()

Unnamed: 0,gender,spkid,vowel_id,vowel,F0,F1,F2,F3
0,M,1,1,IY,160.0,240.0,2280.0,2850.0
1,M,1,1,IY,186.0,280.0,2400.0,2790.0
2,M,1,2,IH,203.0,390.0,2030.0,2640.0
3,M,1,2,IH,192.0,310.0,1980.0,2550.0
4,M,1,3,EH,161.0,490.0,1870.0,2420.0


In [10]:
df=df.drop(columns=['vowel_id'])

In [11]:
df.tail()

Unnamed: 0,gender,spkid,vowel,F0,F1,F2,F3
1515,C,76,UH,322.0,610.0,1550.0,3400.0
1516,C,76,UW,345.0,520.0,1250.0,3460.0
1517,C,76,UW,334.0,500.0,1140.0,3380.0
1518,C,76,ER,308.0,740.0,1850.0,2160.0
1519,C,76,ER,328.0,660.0,1830.0,2200.0


In [12]:
df['ID'] = df['gender'] + df['spkid'].astype(str)

df = df.drop(columns=['gender', 'spkid'])


In [13]:
df.head()

Unnamed: 0,vowel,F0,F1,F2,F3,ID
0,IY,160.0,240.0,2280.0,2850.0,M1
1,IY,186.0,280.0,2400.0,2790.0,M1
2,IH,203.0,390.0,2030.0,2640.0,M1
3,IH,192.0,310.0,1980.0,2550.0,M1
4,EH,161.0,490.0,1870.0,2420.0,M1


This code calculates the number of IDs with missing values by iterating through each row of the dataframe. It checks for NaN or 0 values in each row. If any such values are found, the corresponding ID is added to a set called `missing_ids`. Finally, it prints the list of missing IDs.


In [14]:
missing_ids = set()

for index, row in df.iterrows():
    if any(pd.isnull(row)) or any(row == 0):
        missing_ids.add(row['ID'])

print("Missing IDs:")
print(list(missing_ids))


Missing IDs:
[]


Since there are no missing IDs in the dataset, there's no need to replace any terms or handle missing values as done for the Hillenbrand data. The dataset is assumed to be complete, and no additional preprocessing for missing values is required.


In [15]:
# Filter out rows where 'ID' starts with 'C'
df = df[~df['ID'].str.startswith('C')]


In [16]:
df.head()

Unnamed: 0,vowel,F0,F1,F2,F3,ID
0,IY,160.0,240.0,2280.0,2850.0,M1
1,IY,186.0,280.0,2400.0,2790.0,M1
2,IH,203.0,390.0,2030.0,2640.0,M1
3,IH,192.0,310.0,1980.0,2550.0,M1
4,EH,161.0,490.0,1870.0,2420.0,M1


In [17]:
frequencies = [col for col in df.columns if col.startswith('F')]

This code initializes an empty list called `all_speaker_arrays` to store concatenated arrays for each speaker. It then iterates over unique speaker IDs in the dataframe and selects data for each speaker. For each speaker, it concatenates all the frequency data for that speaker into a single array. This concatenated array is then stored in the `all_speaker_arrays` list.

In summary, this code concatenates all the data for each speaker, providing a clear and efficient format for further analysis, particularly for linear regression.


In [18]:
import numpy as np

# Get unique speaker IDs
unique_ids = df['ID'].unique()

# Initialize an empty list to store concatenated arrays for each speaker
all_speaker_arrays = []

# Iterate over unique speaker IDs
for speaker_id in unique_ids:
    # Select data for the current speaker ID
    speaker_data = df[df['ID'] == speaker_id]
    
    # Initialize an empty list to store frequencies for the current speaker
    speaker_frequencies = []
    
    # Iterate over frequencies
    for freq in frequencies:
        # Append frequencies for the current frequency to the list
        speaker_frequencies.extend(speaker_data[freq].values)
    
    # Convert the list of frequencies to a NumPy array
    speaker_array = np.array(speaker_frequencies)
    
    # Store the concatenated array for the current speaker ID
    all_speaker_arrays.append(speaker_array)


This code calculates the values of Aij and alpha_ij for each pair of speaker IDs (i, j) using linear regression:

1. It iterates over unique combinations of speaker IDs using tqdm for progress tracking.
2. For each pair of speaker IDs (i, j), it selects the corresponding arrays of frequencies.
3. It fits a linear regression model using array_j as the independent variable and array_i as the dependent variable.
4. After fitting the model, it extracts the coefficients alpha_ij and A_ij.
5. The coefficients are stored in lists along with the corresponding speaker IDs.
6. Finally, a DataFrame (`coefficients_df`) is created to organize the coefficients for each pair of speaker IDs.

This process results in a matrix displaying the calculated values of A_ij and alpha_ij for each pair of speaker IDs.


In [19]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from tqdm import tqdm

# Initialize lists to store data
speaker_ids = []
alpha_ij_values = []
A_ij_values = []

# Iterate over unique combinations of speaker IDs
for i, speaker_id_i in enumerate(tqdm(unique_ids, desc="Calculating Coefficients")):
    for j, speaker_id_j in enumerate(unique_ids):
        if i != j:
            # Select arrays for the current pair of speaker IDs
            array_i = all_speaker_arrays[i].reshape(-1, 1)
            array_j = all_speaker_arrays[j].reshape(-1, 1)
            
            # Fit a linear regression model
            reg = LinearRegression()
            reg.fit(array_j, array_i)
            
            # Extract coefficients
            alpha_ij = reg.coef_[0][0]
            intercept = reg.intercept_[0]
            A_ij = intercept / (alpha_ij - 1)
            
            # Store coefficients for the current pair of speaker IDs
            speaker_ids.append((speaker_id_i, speaker_id_j))
            alpha_ij_values.append(alpha_ij)
            A_ij_values.append(A_ij)

# Create a DataFrame
coefficients_df = pd.DataFrame({
    'Speaker ID (Ri)': [speaker_id[0] for speaker_id in speaker_ids],
    'Speaker ID (Sj)': [speaker_id[1] for speaker_id in speaker_ids],
    'Alpha_ij': alpha_ij_values,
    'A_ij': A_ij_values
})

# Print the coefficients DataFrame
print("Matrix for A_ij and alpha_ij:")
print(coefficients_df)


Calculating Coefficients: 100%|██████████| 61/61 [00:02<00:00, 22.45it/s]

Matrix for A_ij and alpha_ij:
     Speaker ID (Ri) Speaker ID (Sj)  Alpha_ij         A_ij
0                 M1              M2  0.921993 -1415.943759
1                 M1              M3  1.015792  1506.291473
2                 M1              M4  0.994682 -1878.404533
3                 M1              M5  1.069621  -369.912481
4                 M1              M6  0.949062  -650.674789
...              ...             ...       ...          ...
3655             W61             W56  0.981626   844.495770
3656             W61             W57  0.910922  -115.859884
3657             W61             W58  0.982143  2153.820782
3658             W61             W59  0.930672  -242.997475
3659             W61             W60  0.922644  -165.243835

[3660 rows x 4 columns]





In [20]:
# Filter coefficients DataFrame for the first speaker ID
first_speaker_id = unique_ids[0]
first_speaker_coefficients = coefficients_df[coefficients_df['Speaker ID (Ri)'] == first_speaker_id]

for index, row in first_speaker_coefficients.iterrows():
    print(f"A_ij for Speaker ID {row['Speaker ID (Sj)']}: {row['A_ij']}")


A_ij for Speaker ID M2: -1415.9437589945135
A_ij for Speaker ID M3: 1506.2914727042912
A_ij for Speaker ID M4: -1878.4045328494822
A_ij for Speaker ID M5: -369.91248143228034
A_ij for Speaker ID M6: -650.6747890919233
A_ij for Speaker ID M7: -338.2041287401379
A_ij for Speaker ID M8: -991.4003208441382
A_ij for Speaker ID M9: -416.9955802178374
A_ij for Speaker ID M10: -1541.601488527728
A_ij for Speaker ID M11: 50786.1079621569
A_ij for Speaker ID M12: 799.893169784055
A_ij for Speaker ID M13: -1771.2482839816169
A_ij for Speaker ID M14: -2003.377493958007
A_ij for Speaker ID M15: 8047.147717404341
A_ij for Speaker ID M16: -23.41656520623627
A_ij for Speaker ID M17: 201.6918460376909
A_ij for Speaker ID M18: 12058.04741988648
A_ij for Speaker ID M19: -1182.1389080820265
A_ij for Speaker ID M20: 1199.1080545057707
A_ij for Speaker ID M21: -539.3816535292209
A_ij for Speaker ID M22: 607.535109609249
A_ij for Speaker ID M23: 4611.804531668172
A_ij for Speaker ID M24: -432.79989099354196


In this code snippet, a new DataFrame (`ndf`) is created by filtering the original coefficients DataFrame (`coefficients_df`) to include only rows where the absolute value of 'A_ij' is greater than 1000:



In [22]:
ndf = coefficients_df[(coefficients_df['A_ij']<-1000) | (coefficients_df['A_ij']>1000) ]

ndf.head()

Unnamed: 0,Speaker ID (Ri),Speaker ID (Sj),Alpha_ij,A_ij
0,M1,M2,0.921993,-1415.943759
1,M1,M3,1.015792,1506.291473
2,M1,M4,0.994682,-1878.404533
8,M1,M10,0.977139,-1541.601489
9,M1,M11,1.000379,50786.107962


In [23]:
ndf['gender'] = ndf['Speaker ID (Sj)'].map(lambda x : str(x)[0])
ndf.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ndf['gender'] = ndf['Speaker ID (Sj)'].map(lambda x : str(x)[0])


Unnamed: 0,Speaker ID (Ri),Speaker ID (Sj),Alpha_ij,A_ij,gender
0,M1,M2,0.921993,-1415.943759,M
1,M1,M3,1.015792,1506.291473,M
2,M1,M4,0.994682,-1878.404533,M


This code snippet counts the number of male and female speakers for each speaker ID in the filtered DataFrame (`ndf`). It groups the data by 'Speaker ID (Ri)', then iterates over each group to count the unique number of male and female 'Speaker ID (Sj)'.

The results are stored in a new DataFrame (`result_df`) with columns for 'Speaker ID', 'Male Count', and 'Female Count'.


In [24]:
import pandas as pd

# Assuming df is your DataFrame

# Group by 'Speaker ID (Ri)'
grouped_df = ndf.groupby('Speaker ID (Ri)')

# Initialize lists to store results
speaker_ids = []
male_counts = []
female_counts = []

# Iterate over each group
for speaker_id, group in grouped_df:
    male_count = group[group['gender'] == 'M']['Speaker ID (Sj)'].nunique()
    female_count = group[group['gender'] == 'W']['Speaker ID (Sj)'].nunique()
    
    # Append results to lists
    speaker_ids.append(speaker_id)
    male_counts.append(male_count)
    female_counts.append(female_count)

# Create a new DataFrame to store the results
result_df = pd.DataFrame({
    'Speaker ID': speaker_ids,
    'Male Count': male_counts,
    'Female Count': female_counts
})

result_df


Unnamed: 0,Speaker ID,Male Count,Female Count
0,M1,16,1
1,M10,13,2
2,M11,12,1
3,M12,10,0
4,M13,15,2
...,...,...,...
56,W57,2,15
57,W58,14,18
58,W59,2,12
59,W60,2,13


This code calculates the average count of male and female speakers for IDs starting with 'M' and 'W', respectively:

- Average male count for IDs starting with 'M': {avg_male_count_m}
- Average female count for IDs starting with 'M': {avg_female_count_m}
- Average female count for IDs starting with 'W': {avg_female_count_w}
- Average male count for IDs starting with 'W': {avg_male_count_w}


In [25]:
# Calculate average male count for IDs starting with "M"
avg_male_count_m = round(result_df[result_df['Speaker ID'].str.startswith('M')]['Male Count'].mean())

# Calculate average female count for IDs starting with "M"
avg_female_count_m = round(result_df[result_df['Speaker ID'].str.startswith('M')]['Female Count'].mean())

# Calculate average female count for IDs starting with "W"
avg_female_count_w = round(result_df[result_df['Speaker ID'].str.startswith('W')]['Female Count'].mean())

# Calculate average male count for IDs starting with "W"
avg_male_count_w = round(result_df[result_df['Speaker ID'].str.startswith('W')]['Male Count'].mean())

print(f"Average male count for IDs starting with 'M': {avg_male_count_m}")
print(f"Average female count for IDs starting with 'M': {avg_female_count_m}")
print(f"Average female count for IDs starting with 'W': {avg_female_count_w}")
print(f"Average male count for IDs starting with 'W': {avg_male_count_w}")

Average male count for IDs starting with 'M': 13
Average female count for IDs starting with 'M': 2
Average female count for IDs starting with 'W': 11
Average male count for IDs starting with 'W': 5


In [26]:
result_df.to_csv('petersonbarney_result.csv', index=False)
