In [1]:
import os
import pandas as pd
import numpy as np
import requests
from io import StringIO
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sqlalchemy import create_engine, text
import matplotlib.pyplot as plt
import matplotlib as mpl
import chardet
import re

In [2]:
server = 'localhost'
database = 'Rundata'
driver = 'ODBC Driver 17 for SQL Server'
connection_string = f"mssql+pyodbc://{server}/{database}?driver={driver}"
engine = create_engine(connection_string)

with engine.connect() as connection:
    result = connection.execute(text('SELECT TOP 5 * FROM Variablerna'))    
    for row in result:
        print(row)

('Mörbylånga', '6275755.1538971', 'smålandsporfyr', 'sten', 'runsten', 'V s 900-t', None, 1)
('Mörbylånga', '6283750.1544290', None, 'sten', 'runsten', 'V', None, 2)
('Mörbylånga', '6268390.1539280', None, 'sten', 'fragment av runsten', 'V efter 1050', None, 3)
('Mörbylånga', '6268390.1539280', 'grå kalksten', 'sten', 'runsten', 'V efter 1050', None, 4)
('Mörbylånga', '6264000.1538250', None, 'sten', 'runsten', 'V', None, 5)


In [3]:
# Load the data
with engine.connect() as connection:
    df = pd.read_sql(text('SELECT Kommun, Koordinater, Materialtyp, Föremål, Period_Datering FROM Variablerna'), connection)

    # Kolumnens namn ändrat från Period/Datering till Period_Datering (också i SQL databasen). Annars tror programmet att vi 
    # menar antingen Period eller Datering och vi får ett error att det inte finns någon kolumn som heter Period eller Datering.
    
# Check the data types
print(df.dtypes)

# Encode the categorical data (if 'Kommun' and 'Föremål' are categorical)
encoder_kommun = LabelEncoder()
encoder_koordinater = LabelEncoder()
encoder_materialtyp = LabelEncoder()
encoder_foremal = LabelEncoder()

    # Jag gör koordinatvärdena till kategorisk data, så att varje värde behandlas som en separat kategori. Annars behandlar
    # programmet värdena som floats, vilket orsakar ValueError: could not convert string to float: '(63.4543 ; 10.9549)'.

df['Kommun'] = encoder_kommun.fit_transform(df['Kommun'])
df['Koordinater'] = encoder_koordinater.fit_transform(df['Koordinater']) 
df['Materialtyp'] = encoder_materialtyp.fit_transform(df['Materialtyp'])
df['Föremål'] = encoder_foremal.fit_transform(df['Föremål'])

df = df.dropna()  # Drop rows with missing values

Kommun             object
Koordinater        object
Materialtyp        object
Föremål            object
Period_Datering    object
dtype: object


In [4]:
# Define features (X) and target (y)
X = df[['Kommun', 'Koordinater', 'Materialtyp', 'Föremål']]
y = df['Period_Datering']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Train a Logistic Regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred_logreg = logreg.predict(X_test)

# Evaluate Logistic Regression
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Logistic Regression Report:\n", classification_report(y_test, y_pred_logreg))

Logistic Regression Accuracy: 0.43925925925925924
Logistic Regression Report:
                                         precision    recall  f1-score   support

                                     ?       0.00      0.00      0.00         5
                                     M       0.38      0.39      0.39       303
                           M 1050-1150       0.00      0.00      0.00         1
                           M 1100-1150       0.00      0.00      0.00         1
    M 1100-1150 (arkeologisk datering)       0.00      0.00      0.00         1
                         M 1100-1200-t       0.00      0.00      0.00         3
                           M 1100-1250       0.00      0.00      0.00         2
                           M 1100-1300       0.00      0.00      0.00         1
                              M 1100-t       0.00      0.00      0.00        23
                   M 1100-t - b 1200-t       0.00      0.00      0.00         1
                    M 1100-talets mitt  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# Train a Support Vector Classifier (SVC)
svc = SVC()
svc.fit(X_train, y_train)

# Predict on the test set
y_pred_svc = svc.predict(X_test)

# Evaluate SVC
print("SVC Accuracy:", accuracy_score(y_test, y_pred_svc))
print("SVC Report:\n", classification_report(y_test, y_pred_svc))

SVC Accuracy: 0.5592592592592592
SVC Report:
                                         precision    recall  f1-score   support

                                     ?       0.00      0.00      0.00         5
                                     M       0.43      0.86      0.58       303
                           M 1050-1150       0.00      0.00      0.00         1
                           M 1100-1150       0.00      0.00      0.00         1
    M 1100-1150 (arkeologisk datering)       0.00      0.00      0.00         1
                         M 1100-1200-t       0.00      0.00      0.00         3
                           M 1100-1250       0.00      0.00      0.00         2
                           M 1100-1300       0.00      0.00      0.00         1
                              M 1100-t       0.00      0.00      0.00        23
                   M 1100-t - b 1200-t       0.00      0.00      0.00         1
                    M 1100-talets mitt       0.00      0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
print(df['Period_Datering'].value_counts())

# Checking if the class distribution is skewed (if some classes have very few or no samples).

Period_Datering
V                                       5620
M                                       2998
V 1065-1075                              256
U 400-650                                208
V efter-Jelling                          180
                                        ... 
M s 1100-t - 1361                          2
M s 1200-t - omkr. 1300                    2
M omkr. 1300                               2
M 1050-1361, troligen 1300-t               2
M s 1200-t (konsthistorisk datering)       2
Name: count, Length: 587, dtype: int64


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Create the classifier with class_weight balanced
model = RandomForestClassifier(class_weight='balanced', random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluate the classification report with zero_division handled
print(classification_report(y_test, y_pred, zero_division=0))

                                                                  precision    recall  f1-score   support

                                                               ?       0.64      0.54      0.58        13
                                                               M       0.92      0.67      0.78       591
                                               M 1100 - b 1200-t       0.43      1.00      0.60         3
                                                     M 1100-1150       1.00      1.00      1.00         2
                              M 1100-1150 (arkeologisk datering)       1.00      1.00      1.00         2
                                   M 1100-1150 (dendrokronologi)       1.00      1.00      1.00         1
                                                     M 1100-1175       0.00      0.00      0.00         1
                                                     M 1100-1200       0.00      0.00      0.00         2
                                             