In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
# Load the feature table from disk and display summary statistics of its numeric columns
df = pd.read_csv(filepath_or_buffer = 'Columns_result.csv')
df.describe()

Unnamed: 0,Age,Word Count CTD,Total Wait Time (s) CTD,Word Count PFT,Total Wait Time (s) PFT,Word Count SFT,Total Wait Time (s) SFT,Converted-MMSE
count,157.0,157.0,157.0,157.0,157.0,157.0,157.0,69.0
mean,65.77707,157.433121,7.917197,32.694268,26.509554,49.292994,18.254777,27.362319
std,12.29634,91.04312,8.103467,15.93161,10.884368,21.727998,9.684712,2.467262
min,23.0,12.0,0.0,8.0,4.0,16.0,1.0,19.0
25%,62.0,90.0,2.0,21.0,18.0,33.0,12.0,27.0
50%,66.0,135.0,5.0,30.0,26.0,45.0,17.0,28.0
75%,72.0,200.0,11.0,41.0,35.0,61.0,24.0,29.0
max,94.0,461.0,39.0,89.0,51.0,120.0,46.0,30.0


In [4]:
# Get an overview of the column data types
df_info = dict(data_types = df.dtypes)
df_info

{'data_types': Record-ID                   object
 TrainOrDev                  object
 Class                       object
 Gender                      object
 Age                        float64
 Word Count CTD             float64
 Total Wait Time (s) CTD    float64
 Word Count PFT             float64
 Total Wait Time (s) PFT    float64
 Word Count SFT             float64
 Total Wait Time (s) SFT    float64
 Converted-MMSE             float64
 dtype: object}

In [5]:
# Restrict the correlation analysis to the float64 (numeric) columns
numeric_columns = df.select_dtypes(include = 'float64').columns
correlation_matrix = df.loc[:, numeric_columns].corr()

# Correlation of every numeric column with "Converted-MMSE", strongest positive first
mmse_correlation = correlation_matrix.loc[:, 'Converted-MMSE'].sort_values(ascending = False)
mmse_correlation

Converted-MMSE             1.000000
Word Count CTD             0.194882
Word Count SFT             0.004747
Total Wait Time (s) PFT   -0.176075
Word Count PFT            -0.231698
Total Wait Time (s) SFT   -0.284651
Age                       -0.335210
Total Wait Time (s) CTD   -0.369492
Name: Converted-MMSE, dtype: float64

In [6]:
# Data preparation
# Hand-picked predictor columns, chosen after the correlation analysis against "Converted-MMSE"
features = ['Word Count CTD', 'Age', 'Total Wait Time (s) CTD']

# Rows with a known MMSE score (used for model training and evaluation)
train_data = df.dropna(subset = ['Converted-MMSE'])
X_train = train_data[features]
y_train = train_data['Converted-MMSE']

# Rows whose MMSE score is missing (these are the ones to predict)
missing_data = df[df['Converted-MMSE'].isnull()]
X_missing = missing_data[features]

# Train/test split of the labelled rows (for model evaluation)
X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train, y_train, test_size = .2, random_state = 42)

# Model training
# NOTE(review): the model is fitted on the 80% training split only and is later
# also used to predict the missing scores; consider refitting on all labelled rows.
rf_model = RandomForestRegressor(random_state = 42, n_estimators = 100)
rf_model.fit(X_train_split, y_train_split)

# Model evaluation
y_pred = rf_model.predict(X_test_split)
# RMSE is computed as the square root of the MSE. The `squared = False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6, so
# passing it raises a TypeError on current releases.
rmse = mean_squared_error(y_test_split, y_pred) ** 0.5

# Predict the missing MMSE scores
missing_predictions = rf_model.predict(X_missing)

# Show the RMSE and the first 5 predictions
rmse, missing_predictions[:5]



(np.float64(1.5768254727231457), array([28.47, 28.12, 28.06, 28.9 , 28.74]))

In [7]:
# Take the row labels from `missing_data`, the frame `missing_predictions` was
# computed from, instead of re-deriving them from `df`. Once this cell has run,
# `df` no longer contains NaNs in "Converted-MMSE", so re-deriving the index on a
# second run would yield an empty selection that no longer matches the predictions.
missing_indices = missing_data.index

# Every predicted value must map to exactly one row
assert len(missing_indices) == len(missing_predictions), "predictions do not match the rows with missing MMSE"

# Replace the missing values with the rounded predictions. The column is float64,
# so the rounded values are stored as floats either way.
df.loc[missing_indices, 'Converted-MMSE'] = missing_predictions.round()

In [9]:
# Write the table with the filled-in MMSE values to disk, without the index column
df.to_csv(path_or_buf = "Columns_filled_mmse.csv", index = False)