In [None]:
import os
import pandas as pd

# Load the CSV file with the correct encoding and specify headers
file_path = 'texts_2024_1120_aggregate.csv'

# Define the headers
headers = ['fullname', 'Measures', 'Amount', 'Level', 'Function']  # Replace with your actual headers

df = pd.read_csv(file_path, encoding='latin1', names=headers, header=0)
df['filename'] = df['fullname'].str.split('/').str[-1]

# Extract type, id, and model
df[['type', 'id', 'model']] = df['filename'].str.extract(r'(\w+)_(\d+)@(\w+)')
#measures_to_compare = unique[1::5][:50]
filtered_measures = [measure for measure in df['Measures'] if ('(par' or '-par') not in measure]
print(filtered_measures[:15])
# Prepare DataFrames for comparison
df = df[df['Measures'].isin(filtered_measures)]
grouped2 = df.groupby(['type', 'model', 'Measures'])


['Letter count (Sent Avg)', 'Letter count (Sent SD)', 'Letter count (Sent Max)', 'Letter count (Sent Min)', 'Word count (Doc)', 'Word count (Sent Avg)', 'Word count (Sent SD)', 'Word count (Sent Max)', 'Word count (Sent Min)', 'Type count (Doc)', 'Type count (Sent Avg)', 'Type count (Sent SD)', 'Type count (Sent Max)', 'Type count (Sent Min)', 'Sentence count (Doc)']


In [68]:
pivot_df = df.pivot(index=['id', 'model', 'type'], columns='Measures', values='Amount').reset_index()
print(pivot_df[:6])

Measures    id    model  type  Adjective avg position (Doc)  \
0         0007     Meta  acad                      0.496633   
1         0007     Meta   fic                      0.526100   
2         0007  chunk_1  acad                      0.350858   
3         0007  chunk_1   fic                      0.536830   
4         0007  chunk_2  acad                      0.373590   
5         0007  chunk_2   fic                      0.533355   

Measures  Adjective avg position (Sent Avg)  \
0                                  0.375528   
1                                  0.311252   
2                                  0.305196   
3                                  0.396934   
4                                  0.426357   
5                                  0.473503   

Measures  Adjective avg position (Sent Max)  \
0                                  0.833333   
1                                  1.000000   
2                                  0.892857   
3                                  0.677

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

outcomes = df['model']
# Correlation-based filtering
corri = df.copy()
corri = corri.drop(columns=['model', 'id', 'type', 'fullname', 'Level', 'Function', 'filename'])
print(corri[:6])
correlation_matrix = corri.corr()
high_corr = [col for col in correlation_matrix if any(correlation_matrix[col] > 0.9)]

# Remove highly correlated features
data_filtered = df.drop(columns=high_corr)

# ANOVA Test for feature selection
selector = SelectKBest(score_func=f_classif, k=100)  # Select top 100 features
selected_features = selector.fit_transform(data_filtered, outcomes)

# Model-based feature selection
model = RandomForestClassifier()
rfe = RFE(model, n_features_to_select=100)
rfe_features = rfe.fit_transform(data_filtered, outcomes)


                                             fullname  \
4   texts_2024_1120_results/chunk_1_acad/acad_0007...   
5   texts_2024_1120_results/chunk_1_acad/acad_0007...   
6   texts_2024_1120_results/chunk_1_acad/acad_0007...   
7   texts_2024_1120_results/chunk_1_acad/acad_0007...   
8   texts_2024_1120_results/chunk_1_acad/acad_0007...   
13  texts_2024_1120_results/chunk_1_acad/acad_0007...   

                   Measures      Amount     Level Function  type    id  \
4   Letter count (Sent Avg)  170.687500  sentence  average  acad  0007   
5    Letter count (Sent SD)   66.316131  sentence    stdev  acad  0007   
6   Letter count (Sent Max)  323.000000  sentence  maximum  acad  0007   
7   Letter count (Sent Min)   85.000000  sentence  minimum  acad  0007   
8          Word count (Doc)  509.000000  document      NaN  acad  0007   
13    Word count (Sent Avg)   31.812500  sentence  average  acad  0007   

      model  
4   chunk_1  
5   chunk_1  
6   chunk_1  
7   chunk_1  
8   chunk_1

ValueError: could not convert string to float: 'texts_2024_1120_results/chunk_1_acad/acad_0007@chunk_1.csv'

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
X = pivot_df.drop(columns=['model'])
y = pivot_df['model']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.65
Classification Report:
               precision    recall  f1-score   support

        Meta       0.91      0.77      0.83        13
     chunk_1       0.36      0.44      0.40         9
     chunk_2       0.45      0.45      0.45        11
         gpt       1.00      1.00      1.00         7

    accuracy                           0.65        40
   macro avg       0.68      0.67      0.67        40
weighted avg       0.68      0.65      0.66        40



In [46]:
filtered_measures2 = [measure for measure in df['Measures'] if '(Doc)' in measure]
df = df[df['type'] == 'acad']
print(filtered_measures2[:15])
dfDoc = df[df['Measures'].isin(filtered_measures2)]
grouped2 = df.groupby(['type', 'model', 'Measures'])

# Calculate the mean for each feature in the group
mean_features2 = grouped2['Amount'].mean()

# Print the results
print(mean_features2)

['Word count (Doc)', 'Type count (Doc)', 'Sentence count (Doc)', 'Paragraph count (Doc)', 'Hapax legomena count (Doc)', 'Hapax legomena incidence (Doc)', 'Adjective count (Doc)', 'Adverb count (Doc)', 'Interjection count (Doc)', 'Lexical verb count (Doc)', 'Noun count (Doc)', 'Proper noun count (Doc)', 'Lexical item count (Doc)', 'Adposition count (Doc)', 'Auxiliary count (Doc)']
type  model  Measures                         
acad  Meta   Adjective avg position (Doc)         0.490135
             Adjective avg position (Sent Avg)    0.392278
             Adjective avg position (Sent Max)    0.819249
             Adjective avg position (Sent Min)    0.048171
             Adjective avg position (Sent SD)     0.254250
                                                    ...   
      gpt    Zipf frequency (Sent Max)            5.464279
             Zipf frequency (Sent Min)            4.071200
             Zipf frequency (Sent SD)             0.379062
             Zipf goodness of fit (Doc)

In [21]:
grouped = df.groupby(['type', 'model', 'Measures'])

# Calculate the mean for each feature in the group
mean_features = grouped['Amount'].mean()

# Print the results
print(mean_features)


type  model  Measures                         
acad  Meta   Adjective avg position (Doc)         0.490135
             Adjective avg position (Sent Avg)    0.392278
             Adjective avg position (Sent Max)    0.819249
             Adjective avg position (Sent Min)    0.048171
             Adjective avg position (Sent SD)     0.254250
                                                    ...   
fic   gpt    Zipf frequency (Sent Max)            6.061212
             Zipf frequency (Sent Min)            4.094123
             Zipf frequency (Sent SD)             0.444964
             Zipf goodness of fit (Doc)           0.945431
             Zipf steepness of curve (Doc)        0.677113
Name: Amount, Length: 14224, dtype: float64


In [23]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming df is your DataFrame
# Filter data for chunk_1 and other models
chunk_1_data = df[mean_features['model'] == 'chunk_1']
other_models_data = df[df['model'] != 'chunk_1']

# Merge the data on 'type' and 'Measures'
merged_data = pd.merge(chunk_1_data, other_models_data, on=['type', 'Measures'], suffixes=('_chunk_1', '_other'))

# Calculate the difference
merged_data['Difference'] = merged_data['Amount_chunk_1'] - merged_data['Amount_other']

# Plot the differences
plt.figure(figsize=(14, 8))
sns.barplot(data=merged_data, x='Measures', y='Difference', hue='type')
plt.xticks(rotation=90)
plt.title('Differences in Measures between chunk_1 and Other Models')
plt.xlabel('Measures')
plt.ylabel('Difference')
plt.legend(title='Type')
plt.tight_layout()
plt.show()

TypeError: 'DataFrameGroupBy' object does not support item assignment