In [29]:
import os
import pandas as pd
import numpy as np
import requests
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
import sqlalchemy 
from sqlalchemy import create_engine, text
import matplotlib.pyplot as plt
import matplotlib as mpl
import chardet
import xlrd
import re

In [30]:
file_path = '/Users/goranskejo/Desktop/Projekt Rundata/rundata.xlsx'
df = pd.read_excel(file_path)

In [14]:
def clean_date(date_string):
    if isinstance(date_string, str):  # Check if the value is a string
        # Extract the first sequence of 3 or 4 digits, possibly followed by a dash and another sequence of digits
        clean_date = re.search(r'\d{3,4}(-\d{3,4})?', date_string)
        if clean_date:
            return clean_date.group(0)  # Return the cleaned date (e.g., '1100-1150' or '900')
    return None  # If no valid date format is found, return None

# Apply the cleaning function to the 'Period_Datering' column
df['Cleaned_Period/Datering'] = df['Period/Datering'].apply(clean_date)

# View the cleaned data
print(df[['Period/Datering', 'Cleaned_Period/Datering']])

# Save the updated DataFrame to the same Excel file
df.to_excel(file_path, index=False)

print("Cleaned data has been saved to the Excel file.")

      Period/Datering Cleaned_Period/Datering
0           V s 900-t                     900
1                   V                    None
2        V efter 1050                    1050
3        V efter 1050                    1050
4                   V                    None
...               ...                     ...
11666             NaN                    None
11667             NaN                    None
11668             NaN                    None
11669             NaN                    None
11670             NaN                    None

[11671 rows x 2 columns]
Cleaned data has been saved to the Excel file.


In [15]:
# Count the number of unique values in the columns 'Period/Datering' and 'Cleaned_Period/Datering'
Period_Datering_unique_values_count = df['Period/Datering'].nunique()
Cleaned_Period_Datering_unique_values_count = df['Cleaned_Period/Datering'].nunique()

print(f"Number of unique values in 'Period/Datering': {Period_Datering_unique_values_count}")
print(f"Number of unique values in 'Cleaned_Period/Datering': {Cleaned_Period_Datering_unique_values_count}")

Number of unique values in 'Period/Datering': 587
Number of unique values in 'Cleaned_Period/Datering': 241


In [33]:
X = df[['Kommun', 'Koordinater', 'Materialtyp', 'Föremål']]
y = df['Cleaned_Period/Datering']

# Function to convert period strings to categorical labels
def convert_to_label(date_string):
    if isinstance(date_string, str):
        return date_string  # Keep it as string for categorical encoding
    return None

# Convert the target variable to categorical
y_categorical = y.apply(convert_to_label)

# Separate the training data (non-null values) and test data (null values)
train_mask = y_categorical.notnull()
test_mask = y_categorical.isnull()

X_train = X[train_mask]
y_train = y_categorical[train_mask]
X_test = X[test_mask]

# Initialize a LabelEncoder for each categorical feature
encoders = {}
for column in X_train.select_dtypes(include=['object']).columns:
    encoder = LabelEncoder()
    X_train[column] = encoder.fit_transform(X_train[column])
    encoders[column] = encoder
    # Encode the test set, handle unseen labels
    X_test[column] = X_test[column].apply(lambda x: encoder.transform([x])[0] if x in encoder.classes_ else np.nan)

# Impute missing values in the features
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust n_estimators
model.fit(X_train, y_train)

# Predict the missing values
predicted_values = model.predict(X_test)

# Create a new column in the original DataFrame for the predicted values
df.loc[test_mask, 'Predicted_Period/Datering'] = predicted_values

# Save the updated DataFrame to the Excel file
output_path = '/Users/goranskejo/Desktop/Projekt Rundata/rundata_updated.xlsx'  # Specify your desired path
df.to_excel(output_path, index=False)

# View the updated DataFrame in console
print(df[['Cleaned_Period/Datering', 'Predicted_Period/Datering']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[column] = encoder.fit_transform(X_train[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[column] = X_test[column].apply(lambda x: encoder.transform([x])[0] if x in encoder.classes_ else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[column] = encoder.fit_transf

      Cleaned_Period/Datering Predicted_Period/Datering
0                         900                       NaN
1                         NaN                       375
2                        1050                       NaN
3                        1050                       NaN
4                         NaN                       375
...                       ...                       ...
11666                     NaN                      1000
11667                     NaN                      1000
11668                     NaN                      1000
11669                     NaN                      1000
11670                     NaN                      1000

[11671 rows x 2 columns]
