In [33]:
import pandas as pd
import numpy as np
import import_ipynb
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


In [36]:
# Location of the KnotInfo spreadsheet, relative to the notebook's working directory.
data_file = 'dataset/knotinfo_data_complete.xls'

# skiprows=[1] drops the sheet line at 0-based index 1, i.e. the line directly
# below the header row.
# NOTE(review): presumably that line holds column descriptions rather than
# knot data -- confirm against the spreadsheet.
knot_info = pd.read_excel(io=data_file, skiprows=[1])

In [37]:
def convert_entry_to_int(entry):
    """Coerce a single table cell to an int, or return None if it is unusable.

    Conversion rules, in order:
      * anything ``int()`` accepts is returned as ``int(entry)`` (note that
        floats are truncated toward zero, e.g. 2.7 -> 2);
      * the flag ``'Y'`` becomes 1 and ``'N'`` becomes 0;
      * everything else (None, NaN, infinities, free text, ...) becomes None.

    Parameters
    ----------
    entry : object
        Raw cell value.

    Returns
    -------
    int or None
        The integer encoding of the cell, or None when no rule applies.
    """
    try:
        return int(entry)
    except (TypeError, ValueError, OverflowError):
        # Only the failures int() raises for non-numeric cells are handled:
        # TypeError (e.g. None), ValueError (e.g. 'Y', NaN), OverflowError (inf).
        # A bare ``except:`` would also swallow KeyboardInterrupt/SystemExit.
        pass

    if entry == 'Y':
        return 1
    if entry == 'N':
        return 0
    return None

In [35]:
# Columns selected from the knot table: the predictor columns first, then the
# label column 'alternating' as the final entry.
features = [
    'crossing_number',
    'three_genus',
    'bridge_index',
    'braid_index',
    'signature',
    'smooth_four_genus',
    'topological_four_genus',
    # 'turaev_genus',  # left out on purpose: with it the test scores go to 1
    'determinant',
] + ['alternating']

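# As soon as 'turaev_genus' is added to the feature list, the test scores go to 1.
# This is expected: a knot is alternating exactly when its Turaev genus is 0,
# so that column effectively encodes the label.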

In [45]:
def predict_alternating(features, r_state, data=None):
    """Fit a logistic-regression classifier for the 'alternating' column.

    Despite its name, this function does not predict anything itself: it
    cleans the selected columns, fits the model and returns the fitted
    estimator.

    Parameters
    ----------
    features : list of str
        Columns to select from the table. Must contain 'alternating', which
        is used as the target; every other listed column is a predictor.
    r_state : int
        Forwarded to ``LogisticRegression(random_state=...)``.
        NOTE(review): scikit-learn documents random_state as only relevant
        for the 'sag', 'saga' and 'liblinear' solvers, so with the default
        solver it is not expected to change the fit -- confirm.
    data : pandas.DataFrame, optional
        Table to read the columns from. Defaults to the notebook-level
        ``knot_info`` frame, which preserves the original behaviour.

    Returns
    -------
    LogisticRegression
        The estimator fitted on all rows that survive cleaning.
    """
    source = knot_info if data is None else data

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; check the class (not the instance) so a column that
    # happens to be called 'map' cannot be mistaken for the method.
    elementwise = 'map' if hasattr(pd.DataFrame, 'map') else 'applymap'
    converted = getattr(source[features], elementwise)(convert_entry_to_int)

    # Rows with any cell that could not be converted are discarded.
    cleaned = converted.dropna()
    X = cleaned.drop(columns='alternating')
    Y = cleaned['alternating']

    # max_iter needs to be at least 500 for convergence
    return LogisticRegression(random_state=r_state, max_iter=500).fit(X, Y)

In [54]:
# Rebuild the cleaned table at notebook level: same column selection,
# element-wise int conversion and dropna() that predict_alternating applies.
# X and Y are kept as globals for the cross-validation cell.
df = (
    knot_info[features]
    .applymap(convert_entry_to_int)
    .dropna()
)
Y = df['alternating']
X = df.drop(columns=['alternating'])

# `theta` is the fitted LogisticRegression estimator, not a bare parameter vector.
theta = predict_alternating(features, 3000)

# One coefficient per predictor column, in the column order of X.
print(theta.coef_)


[[-3.08368041  1.47153871 -4.50494608  1.51621302  0.14794177  0.40066507
   0.80577468  0.06376554]]


In [42]:
# Score the classifier with 500-fold cross-validation on the cleaned data,
# then show every fold's score followed by the mean over all folds.
scores = cross_val_score(estimator=theta, X=X, y=Y, cv=500)
print(scores, scores.mean(), sep='\n')

[0.83333333 0.83333333 0.83333333 0.83333333 0.66666667 0.83333333
 1.         1.         1.         1.         1.         1.
 0.83333333 1.         1.         1.         0.83333333 1.
 1.         0.83333333 1.         1.         0.83333333 1.
 1.         1.         1.         1.         1.         1.
 0.83333333 1.         1.         0.83333333 0.83333333 1.
 1.         1.         1.         1.         1.         1.
 1.         0.83333333 1.         1.         1.         0.83333333
 1.         1.         1.         1.         1.         1.
 0.83333333 1.         1.         1.         1.         1.
 0.66666667 1.         1.         0.66666667 1.         1.
 1.         1.         1.         1.         1.         1.
 1.         0.83333333 0.83333333 1.         1.         1.
 1.         1.         0.83333333 1.         1.         0.83333333
 1.         1.         1.         1.         1.         1.
 0.83333333 1.         1.         1.         1.         1.
 1.         1.         1.       

In [41]:
# Percentage of alternating knots in the usable dataset: 62.20127903062942%

In [None]:
# random state = 100 : [[-3.08368041  1.47153871 -4.50494608  1.51621302  0.14794177  0.40066507 0.80577468  0.06376554]]
# random state = 200 : [[-3.08368041  1.47153871 -4.50494608  1.51621302  0.14794177  0.40066507 0.80577468  0.06376554]]
# random state = 300 : [[-3.08368041  1.47153871 -4.50494608  1.51621302  0.14794177  0.40066507 0.80577468  0.06376554]]
# random state = 400 : [[-3.08368041  1.47153871 -4.50494608  1.51621302  0.14794177  0.40066507 0.80577468  0.06376554]]
# random state = 900 : [[-3.08368041  1.47153871 -4.50494608  1.51621302  0.14794177  0.40066507 0.80577468  0.06376554]]

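# Conclusion: the coefficients are identical for every random state tried.
# This is expected: LogisticRegression only uses random_state with the
# 'sag', 'saga' and 'liblinear' solvers, and the default solver is 'lbfgs'.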