# Step 3 - Create Dataset Only

Sources: 
Previous Weeks Code, Generative AI, 
https://cran.r-project.org/web/packages/celltrackR/vignettes/clustering.html

### 1. Read Data

In [1]:
# Imports
import pandas as pd

# Read in 2 Data Files and Combine, Mark as M/F
female_data = pd.read_csv('file_list_female_parsed_with_data_failed.csv')
male_data = pd.read_csv('file_list_male_parsed_with_data.csv')
female_data['gender'] = 0  # Female data is 0
male_data['gender'] = 1    # Male data is 1
combined_data = pd.concat([female_data, male_data], ignore_index=True)
combined_data.to_csv('combined_dataset.csv', index=False)
print(combined_data.shape)
print(combined_data.head())

(752827, 17)
                experiment_name video_base_name  track_id  count_uncalibrated  \
0  MC_singlenuc23_1_Tk33_021220        0001_vid      1330                   0   
1  MC_singlenuc23_1_Tk33_021220        0001_vid      1330                   1   
2  MC_singlenuc23_1_Tk33_021220        0001_vid      1330                  10   
3  MC_singlenuc23_1_Tk33_021220        0001_vid      1330                 100   
4  MC_singlenuc23_1_Tk33_021220        0001_vid      1330                1000   

   new_count_uncalibrated          xc          yc     u_dot      v_dot  \
0                  382058  117.405639  104.372970  6.973857  22.796302   
1                  382059  123.302195  123.913192  6.585702  21.623125   
2                  382068  111.683189  400.633206 -0.382109  28.731078   
3                  382158  821.635502  527.801237  5.468046  -3.778739   
4                  383058  883.833486  309.388663  2.265553   0.529649   

         s_dot  class_id   p_value  track_length       

In [2]:
# Rename Data
data = combined_data

# Save Class ID for Later Comparison
class_id_array = data[['gender', 'track_id']].copy()

# Drop Potential Labels
data.drop('gender', inplace=True, axis=1) 

# Drop Un-needed Columns
data.drop('video_base_name', inplace=True, axis=1) 

# Drop Cols that We Might Need
data.drop('w', inplace=True, axis=1) 
data.drop('h', inplace=True, axis=1) 
data.drop('p_value', inplace=True, axis=1) 
data.drop('class_id', inplace=True, axis=1)
data.drop('track_length', inplace=True, axis=1) 
data.drop('experiment_name', inplace=True, axis=1) 
data.drop('count_uncalibrated', inplace=True, axis=1) 
data.drop('new_count_uncalibrated', inplace=True, axis=1) 

# Check Modified Data
print("Modified CSV Data: \n") 
print(data)

# Rename DF
df = data

Modified CSV Data: 

        track_id          xc          yc     u_dot      v_dot        s_dot  \
0           1330  117.405639  104.372970  6.973857  22.796302  4094.324244   
1           1330  123.302195  123.913192  6.585702  21.623125  4154.148577   
2           1330  111.683189  400.633206 -0.382109  28.731078  1993.790100   
3           1330  821.635502  527.801237  5.468046  -3.778739    82.335724   
4           1330  883.833486  309.388663  2.265553   0.529649    -4.429363   
...          ...         ...         ...       ...        ...          ...   
752822      9350  156.276651  671.360475  4.437002   6.482767    23.792231   
752823      9350  162.206921  672.395285  4.571728   5.991240    25.240829   
752824      9350  170.841343  672.257588  4.938269   5.438280    26.805397   
752825      9350  178.862178  673.323482  5.216379   5.043803    29.915899   
752826      9350  187.945173  674.862680  5.565223   4.727619    30.420474   

         frame  
0       382058  
1       

### 2. Combine Frames by Track & Create Features

In [3]:
# Imports
import numpy as np

In [4]:
# Create Features
def calculate_speed(df):
    df['speed'] = np.sqrt(df['u_dot']**2 + df['v_dot']**2)
    return df

def calculate_mean_turning_angle(df):
    df['angle'] = np.arctan2(df['v_dot'], df['u_dot'])
    df['turning_angle'] = df['angle'].diff().abs()
    mean_turning_angle = df['turning_angle'].mean()
    return mean_turning_angle

def calculate_outreach_ratio(df):
    start_x, start_y = df.iloc[0][['xc', 'yc']]
    end_x, end_y = df.iloc[-1][['xc', 'yc']]
    displacement = np.sqrt((end_x - start_x)**2 + (end_y - start_y)**2)
    path_length = df['speed'].sum()
    outreach_ratio = displacement / path_length if path_length != 0 else 0
    return outreach_ratio

def calculate_square_displacement(df):
    start_x, start_y = df.iloc[0][['xc', 'yc']]
    displacements = np.sqrt((df['xc'] - start_x)**2 + (df['yc'] - start_y)**2)
    square_displacement = np.sum(displacements**2)
    return square_displacement

In [5]:
# Combine Frames by Track and Calculate Feature Values
features = []

# Group by track_id
grouped = df.groupby('track_id')

for track_id, group in grouped:
    group = group.sort_values(by='frame')
    speed = calculate_speed(group)['speed'].mean()
    mean_turning_angle = calculate_mean_turning_angle(group)
    outreach_ratio = calculate_outreach_ratio(group)
    square_displacement = calculate_square_displacement(group)
    
    features.append({
        'track_id': track_id,
        'speed': speed,
        'mean_turning_angle': mean_turning_angle,
        'outreach_ratio': outreach_ratio,
        'square_displacement': square_displacement
    })

# Convert to DF
features_df = pd.DataFrame(features)

# Check New Features
print(features_df.head())

   track_id     speed  mean_turning_angle  outreach_ratio  square_displacement
0         6  3.827775            0.103121        0.136342         1.127503e+08
1        14  5.366490            0.159354        0.592727         1.697657e+07
2        17  3.123633            0.145173        0.083003         3.502969e+08
3        18  1.663800            0.303789        0.027139         8.095789e+08
4        31  4.150165            0.139008        0.056169         4.782558e+08


### 3. Prepare Data

In [6]:
# Imports
from sklearn.preprocessing import StandardScaler

In [7]:
# Standardize Data
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features_df[['speed', 'mean_turning_angle', 'outreach_ratio', 'square_displacement']])
normalized_features_df = pd.DataFrame(normalized_features, columns=['speed', 'mean_turning_angle', 'outreach_ratio', 'square_displacement'])

# Add track_id Back
normalized_features_df['track_id'] = features_df['track_id']

# Check Data
data = normalized_features_df
print(data.head())

      speed  mean_turning_angle  outreach_ratio  square_displacement  track_id
0  0.017104           -0.763880       -0.095906            -0.526893         6
1  0.903157           -0.088360        2.758743            -0.734142        14
2 -0.388369           -0.258712       -0.429537            -0.012855        17
3 -1.228999            1.646708       -0.778964             0.981006        18
4  0.202749           -0.332778       -0.597380             0.264041        31


In [8]:
class_id_array.to_csv('class_id_array.csv', index=False)
data.to_csv('data.csv', index=False)