In [1]:
import os
import pandas as pd
import numpy as np
import requests
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
import sqlalchemy 
from sqlalchemy import create_engine, text
import matplotlib.pyplot as plt
import matplotlib as mpl
import chardet
import xlrd
import re

In [2]:
file_path = '/Users/goranskejo/Desktop/Projekt Rundata/cleaned_Period_Datering_rundata.xlsx'
df = pd.read_excel(file_path)

In [3]:
# Count the number of unique values in the columns 'Period/Datering' and 'Cleaned_Period/Datering'
Period_Datering_unique_values_count = df['Period/Datering'].nunique()

print(f"Number of unique values in 'Period/Datering': {Period_Datering_unique_values_count}")

Number of unique values in 'Period/Datering': 20


In [4]:
list_unique_dates = list(df['Period/Datering'].unique())
list_unique_dates.sort()
list_unique_dates

['Allmän/osäker sentida',
 'Folkvandringstid 160–375',
 'Högmedeltid 1150–1300',
 'Mellanvendeltid 600–700',
 'Modern tid 1500–1700',
 'Modern tid 1700–1800',
 'Modern tid 1800–1900',
 'Oklar/allmän vendeltid',
 'Okänd',
 'Osäker/allmän medeltida',
 'Sen vikingatid 1050-1200',
 'Senmedeltid 1300–1500',
 'Sent folkvandringstid 375–500',
 'Sent vendeltid 700–800',
 'Tidig medeltid 1000–1150',
 'Tidig vendeltid 500–600',
 'Tidig vikingatiden 700-800',
 'Vikingatiden 800-1050',
 'Vikingatiden 800–1050',
 'unknown']

In [7]:
# Define unique numeric and non-numeric values (normalized to match DataFrame)
numeric_values = [
    'folkvandringstid 160–375',
    'högmedeltid 1150–1300',
    'mellanvendeltid 600–700',
    'modern tid 1500–1700',
    'modern tid 1700–1800',
    'modern tid 1800–1900',
    'sen vikingatid 1050-1200',
    'senmedeltid 1300–1500',
    'sent folkvandringstid 375–500',
    'sent vendeltid 700–800',
    'tidig medeltid 1000–1150',
    'tidig vendeltid 500–600',
    'tidig vikingatiden 700-800',
    'vikingatiden 800-1050',
    'vikingatiden 800–1050'
]

non_numeric_values = [
    'allmän/osäker sentida',
    'oklar/allmän vendeltid',
    'okänd',
    'osäker/allmän medeltida',
    'unknown'
]

# Normalize the lists to lowercase for consistent comparison
numeric_values = [value.lower() for value in numeric_values]
non_numeric_values = [value.lower() for value in non_numeric_values]

# Convert the DataFrame values to lowercase for comparison
df['Period/Datering'] = df['Period/Datering'].str.lower()

# Filter the DataFrame for training and testing
train_mask = df['Period/Datering'].isin(numeric_values)
test_mask = df['Period/Datering'].isin(non_numeric_values)

# Print counts of training and testing sets
print(f"Number of training samples: {train_mask.sum()}")
print(f"Number of testing samples: {test_mask.sum()}")

# Proceed only if we have samples for training and testing
if train_mask.sum() > 0 and test_mask.sum() > 0:
    X_train = df[train_mask][['Kommun', 'Koordinater', 'Materialtyp', 'Föremål']]
    y_train = df[train_mask]['Period/Datering']
    X_test = df[test_mask][['Kommun', 'Koordinater', 'Materialtyp', 'Föremål']]

    # Encode categorical features
    label_encoders = {}
    for column in X_train.select_dtypes(include=['object']).columns:
        encoder = LabelEncoder()
        X_train[column] = encoder.fit_transform(X_train[column])
        label_encoders[column] = encoder
        # Apply the same encoding to the test set
        X_test[column] = X_test[column].apply(lambda x: encoder.transform([x])[0] if x in encoder.classes_ else np.nan)

    # Impute missing values in the features
    imputer = SimpleImputer(strategy='most_frequent')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Scale the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train the Random Forest model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predict the missing values
    predicted_values = model.predict(X_test)

    # Create a new column in the original DataFrame for the predicted values
    df.loc[test_mask, 'Predicted_Period/Datering'] = predicted_values

    # Save the updated DataFrame to Excel
    output_path = '/Users/goranskejo/Desktop/Projekt Rundata/Rundata_period_updated.xlsx'  # Specify your desired path
    df.to_excel(output_path, index=False)

    # Optional: View the updated DataFrame in console
    print(df[['Period/Datering', 'Predicted_Period/Datering']])
else:
    print("No valid samples found for training or testing.")

Number of training samples: 4984
Number of testing samples: 6687
             Period/Datering Predicted_Period/Datering
0      allmän/osäker sentida     vikingatiden 800–1050
1      allmän/osäker sentida     senmedeltid 1300–1500
2      allmän/osäker sentida     vikingatiden 800-1050
3      allmän/osäker sentida     vikingatiden 800-1050
4      allmän/osäker sentida     vikingatiden 800-1050
...                      ...                       ...
11666  vikingatiden 800–1050                       NaN
11667  vikingatiden 800–1050                       NaN
11668  vikingatiden 800–1050                       NaN
11669  vikingatiden 800–1050                       NaN
11670  vikingatiden 800–1050                       NaN

[11671 rows x 2 columns]
