In [37]:
import pickle
from collections import deque
from enum import Enum
from cursor import *
import re
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
train_text_path = "/Users/lkk/Documents/BOUN CMPE/CMPE 561-Natural Language Processing/Application Project 1/corpora/UD_Turkish-BOUN/tr_boun-ud-train.txt"
# Load the train text file
with open(train_text_path, "r", encoding="utf-8") as file:
    text = file.read()

In [3]:
tokens_path = "token_list_ud_boun.pkl"
# Load the token list corresponding to the train text file
with open("token_list_ud_boun.pkl", "rb") as file:
    tokens = pickle.load(file)

In [4]:
class Patterns(Enum):
    WHITESPACE = re.compile(r"\s")
    UPPER_ALPHABETICAL = re.compile(r"[ÇĞİÖŞÜA-Z]")
    LOWER_ALPHABETICAL = re.compile(r"[ûâçğıöşüa-z]")
    NUMBER = re.compile(r"\d")
    PERIOD = re.compile(r"\.")
    APOSTROPHE = re.compile(r"\'")
    OTHER_EOS = re.compile(r'[\!\?…]')

In [5]:
def analyzeEachCursorPosition(text, tokens):
    # Build a linked list from the token list
    tokenLinkedList = deque(tokens)
    # Helper function to get the next token from the linked list
    def getNextToken():
        return tokenLinkedList.popleft() if len(tokenLinkedList) > 0 else None
    
    # Next token in the text
    nextToken = getNextToken()
    # List to store all cursor objects
    cursors = []
    
    # Variables for calculating numerical (distance) features of the cursor
    lastWhitespacePosition = None
    lastUpperAlphabeticalPosition = None
    lastLowerAlphabeticalPosition = None
    lastNumberPosition = None
    lastPeriodPosition = None
    lastApostrophePosition = None
    lastOtherEOSPosition = None
    lastOtherPosition = None
    
    # For each cursor position in the text (if text is of size N, it has N+1 cursor positions)
    for i in range(len(text)+1):
        # Create cursor object
        cursor = Cursor(i)
        
        # Check for numerical features (which are limited to a maximum distance of 100)
        cursor.numericalFeatures["distanceToLeftWhitespace"] = min(i - lastWhitespacePosition, 100) if lastWhitespacePosition is not None else 100
        cursor.numericalFeatures["distanceToLeftUpperAlphabetical"] = min(i - lastUpperAlphabeticalPosition, 100) if lastUpperAlphabeticalPosition is not None else 100
        cursor.numericalFeatures["distanceToLeftLowerAlphabetical"] = min(i - lastLowerAlphabeticalPosition, 100) if lastLowerAlphabeticalPosition is not None else 100
        cursor.numericalFeatures["distanceToLeftNumber"] = min(i - lastNumberPosition, 100) if lastNumberPosition is not None else 100
        cursor.numericalFeatures["distanceToLeftPeriod"] = min(i - lastPeriodPosition, 100) if lastPeriodPosition is not None else 100
        cursor.numericalFeatures["distanceToLeftApostrophe"] = min(i - lastApostrophePosition, 100) if lastApostrophePosition is not None else 100
        cursor.numericalFeatures["distanceToLeftOtherEOS"] = min(i - lastOtherEOSPosition, 100) if lastOtherEOSPosition is not None else 100
        cursor.numericalFeatures["distanceToLeftOther"] = min(i - lastOtherPosition, 100) if lastOtherPosition is not None else 100
        
        # Check for binary features of the char to the left of the current cursor position
        if i < 1:
            cursor.leftCharBinaryFeatures["isLeftWhitespace"] = 1
        else:
            if Patterns.WHITESPACE.value.match(text[i-1]) is not None:
                cursor.leftCharBinaryFeatures["isLeftWhitespace"] = 1
            elif Patterns.UPPER_ALPHABETICAL.value.match(text[i-1]) is not None:
                cursor.leftCharBinaryFeatures["isLeftUpperAlphabetical"] = 1
            elif Patterns.LOWER_ALPHABETICAL.value.match(text[i-1]) is not None:
                cursor.leftCharBinaryFeatures["isLeftLowerAlphabetical"] = 1
            elif Patterns.NUMBER.value.match(text[i-1]) is not None:
                cursor.leftCharBinaryFeatures["isLeftNumber"] = 1
            elif Patterns.PERIOD.value.match(text[i-1]) is not None:
                cursor.leftCharBinaryFeatures["isLeftPeriod"] = 1
            elif Patterns.APOSTROPHE.value.match(text[i-1]) is not None:
                cursor.leftCharBinaryFeatures["isLeftApostrophe"] = 1
            elif Patterns.OTHER_EOS.value.match(text[i-1]) is not None:
                cursor.leftCharBinaryFeatures["isLeftOtherEOS"] = 1
            else:
                cursor.leftCharBinaryFeatures["isLeftOther"] = 1
            
        # Check for binary features of the char to the right of the current cursor position
        if i > len(text) - 1:
            cursor.rightCharBinaryFeatures["isRightWhitespace"] = 1
        else:
            if Patterns.WHITESPACE.value.match(text[i]) is not None:
                cursor.rightCharBinaryFeatures["isRightWhitespace"] = 1
                lastWhitespacePosition = i
            elif Patterns.UPPER_ALPHABETICAL.value.match(text[i]) is not None:
                cursor.rightCharBinaryFeatures["isRightUpperAlphabetical"] = 1
                lastUpperAlphabeticalPosition = i
            elif Patterns.LOWER_ALPHABETICAL.value.match(text[i]) is not None:
                cursor.rightCharBinaryFeatures["isRightLowerAlphabetical"] = 1
                lastLowerAlphabeticalPosition = i
            elif Patterns.NUMBER.value.match(text[i]) is not None:
                cursor.rightCharBinaryFeatures["isRightNumber"] = 1
                lastNumberPosition = i
            elif Patterns.PERIOD.value.match(text[i]) is not None:
                cursor.rightCharBinaryFeatures["isRightPeriod"] = 1
                lastPeriodPosition = i
            elif Patterns.APOSTROPHE.value.match(text[i]) is not None:
                cursor.rightCharBinaryFeatures["isRightApostrophe"] = 1
                lastApostrophePosition = i
            elif Patterns.OTHER_EOS.value.match(text[i]) is not None:
                cursor.rightCharBinaryFeatures["isRightOtherEOS"] = 1
                lastOtherEOSPosition = i
            else:
                cursor.rightCharBinaryFeatures["isRightOther"] = 1
                lastOtherPosition = i
            
        # # Check for bigram features
        # if (i > 1) and (i < len(text) - 1):
        #     cursor.bigramFeatures["bigramLeft"] = text[i-2:i]
        #     cursor.bigramFeatures["bigramRight"] = text[i:i+2]
        # else:
        #     if i == 0:
        #         cursor.bigramFeatures["bigramLeft"] = ". "
        #         cursor.bigramFeatures["bigramRight"] = text[i:i+2]
        #     elif i == 1:
        #         cursor.bigramFeatures["bigramLeft"] = " " + text[i-1:i]
        #         cursor.bigramFeatures["bigramRight"] = text[i:i+2]
        #     elif i == len(text) - 1:
        #         cursor.bigramFeatures["bigramLeft"] = text[i-2:i]
        #         cursor.bigramFeatures["bigramRight"] = text[i:i+1] + " "
        #     elif i == len(text):
        #         cursor.bigramFeatures["bigramLeft"] = text[i-2:i]
        #         cursor.bigramFeatures["bigramRight"] = " ."
                
        # Check for the label (if current cursor position is the start of a new token or not)
        if nextToken is not None:
            if text[i: i+len(nextToken)] == nextToken:
                cursor.label = 1
                nextToken = getNextToken()
        else:
            if i == len(text):
                cursor.label = 1
        
        cursors.append(cursor)
            
    return cursors


In [6]:
cursors = analyzeEachCursorPosition(text, tokens)

In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [8]:
# Create a list of dictionaries for the DataFrame
indices = []
leftCharFeatures = []
rightCharFeatures = []
numericalFeatures = []
# bigramFeatures = []
labels = []


for cursor in cursors:
    indices.append(cursor.position)  # Collect the index (self.position)
    leftCharFeatures.append(cursor.leftCharBinaryFeatures)  # Collect the feature dictionary
    rightCharFeatures.append(cursor.rightCharBinaryFeatures)
    numericalFeatures.append(cursor.numericalFeatures)
    # bigramFeatures.append(cursor.bigramFeatures)
    labels.append(cursor.label)

# Create the DataFrame
dfLeft = pd.DataFrame(leftCharFeatures, index=indices)
dfRight = pd.DataFrame(rightCharFeatures, index=indices)
dfNum = pd.DataFrame(numericalFeatures, index=indices)
# dfBi = pd.DataFrame(bigramFeatures, index=indices)
dfLabels = pd.DataFrame(labels, index=indices)

In [9]:
dfLeft.dtypes

isLeftWhitespace           int64
isLeftUpperAlphabetical    int64
isLeftLowerAlphabetical    int64
isLeftNumber               int64
isLeftPeriod               int64
isLeftApostrophe           int64
isLeftOtherEOS             int64
isLeftOther                int64
dtype: object

In [10]:
dfRight.dtypes

isRightWhitespace           int64
isRightUpperAlphabetical    int64
isRightLowerAlphabetical    int64
isRightNumber               int64
isRightPeriod               int64
isRightApostrophe           int64
isRightOtherEOS             int64
isRightOther                int64
dtype: object

In [11]:
dfNum

Unnamed: 0,distanceToLeftWhitespace,distanceToLeftUpperAlphabetical,distanceToLeftLowerAlphabetical,distanceToLeftNumber,distanceToLeftPeriod,distanceToLeftApostrophe,distanceToLeftOtherEOS,distanceToLeftOther
0,100,100,100,100,100,100,100,100
1,100,100,100,1,100,100,100,100
2,100,100,100,1,100,100,100,100
3,100,100,100,1,100,100,100,100
4,100,100,100,1,100,100,100,100
...,...,...,...,...,...,...,...,...
616494,5,60,1,100,62,100,100,100
616495,6,61,1,100,63,100,100,100
616496,7,62,2,100,1,100,100,100
616497,1,63,3,100,2,100,100,100


In [12]:
scaler = MinMaxScaler()
scaledValues = scaler.fit_transform(dfNum)
dfNumScaled = pd.DataFrame(scaledValues, columns=dfNum.columns)

In [13]:
dfNumScaled

Unnamed: 0,distanceToLeftWhitespace,distanceToLeftUpperAlphabetical,distanceToLeftLowerAlphabetical,distanceToLeftNumber,distanceToLeftPeriod,distanceToLeftApostrophe,distanceToLeftOtherEOS,distanceToLeftOther
0,1.000000,1.000000,1.000000,1.0,1.000000,1.0,1.0,1.0
1,1.000000,1.000000,1.000000,0.0,1.000000,1.0,1.0,1.0
2,1.000000,1.000000,1.000000,0.0,1.000000,1.0,1.0,1.0
3,1.000000,1.000000,1.000000,0.0,1.000000,1.0,1.0,1.0
4,1.000000,1.000000,1.000000,0.0,1.000000,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...
616494,0.040404,0.595960,0.000000,1.0,0.616162,1.0,1.0,1.0
616495,0.050505,0.606061,0.000000,1.0,0.626263,1.0,1.0,1.0
616496,0.060606,0.616162,0.010101,1.0,0.000000,1.0,1.0,1.0
616497,0.000000,0.626263,0.020202,1.0,0.010101,1.0,1.0,1.0


In [14]:
dfNumScaled.dtypes

distanceToLeftWhitespace           float64
distanceToLeftUpperAlphabetical    float64
distanceToLeftLowerAlphabetical    float64
distanceToLeftNumber               float64
distanceToLeftPeriod               float64
distanceToLeftApostrophe           float64
distanceToLeftOtherEOS             float64
distanceToLeftOther                float64
dtype: object

In [15]:
dfLabels

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0
...,...
616494,0
616495,1
616496,0
616497,0


In [16]:
dfLabels.dtypes

0    int64
dtype: object

In [17]:
dfAllFeatures = pd.concat([dfLeft, dfRight, dfNumScaled], axis=1)

In [18]:
dfAllFeatures.dtypes

isLeftWhitespace                     int64
isLeftUpperAlphabetical              int64
isLeftLowerAlphabetical              int64
isLeftNumber                         int64
isLeftPeriod                         int64
isLeftApostrophe                     int64
isLeftOtherEOS                       int64
isLeftOther                          int64
isRightWhitespace                    int64
isRightUpperAlphabetical             int64
isRightLowerAlphabetical             int64
isRightNumber                        int64
isRightPeriod                        int64
isRightApostrophe                    int64
isRightOtherEOS                      int64
isRightOther                         int64
distanceToLeftWhitespace           float64
distanceToLeftUpperAlphabetical    float64
distanceToLeftLowerAlphabetical    float64
distanceToLeftNumber               float64
distanceToLeftPeriod               float64
distanceToLeftApostrophe           float64
distanceToLeftOtherEOS             float64
distanceToL

In [19]:
dfAllFeatures

Unnamed: 0,isLeftWhitespace,isLeftUpperAlphabetical,isLeftLowerAlphabetical,isLeftNumber,isLeftPeriod,isLeftApostrophe,isLeftOtherEOS,isLeftOther,isRightWhitespace,isRightUpperAlphabetical,...,isRightOtherEOS,isRightOther,distanceToLeftWhitespace,distanceToLeftUpperAlphabetical,distanceToLeftLowerAlphabetical,distanceToLeftNumber,distanceToLeftPeriod,distanceToLeftApostrophe,distanceToLeftOtherEOS,distanceToLeftOther
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1.000000,1.000000,1.000000,1.0,1.000000,1.0,1.0,1.0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,1.000000,1.000000,1.000000,0.0,1.000000,1.0,1.0,1.0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,1.000000,1.000000,1.000000,0.0,1.000000,1.0,1.0,1.0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,1.000000,1.000000,1.000000,0.0,1.000000,1.0,1.0,1.0
4,0,0,0,1,0,0,0,0,1,0,...,0,0,1.000000,1.000000,1.000000,0.0,1.000000,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616494,0,0,1,0,0,0,0,0,0,0,...,0,0,0.040404,0.595960,0.000000,1.0,0.616162,1.0,1.0,1.0
616495,0,0,1,0,0,0,0,0,0,0,...,0,0,0.050505,0.606061,0.000000,1.0,0.626263,1.0,1.0,1.0
616496,0,0,0,0,1,0,0,0,1,0,...,0,0,0.060606,0.616162,0.010101,1.0,0.000000,1.0,1.0,1.0
616497,1,0,0,0,0,0,0,0,1,0,...,0,0,0.000000,0.626263,0.020202,1.0,0.010101,1.0,1.0,1.0


In [21]:
total_invalid = dfLabels.isnull().sum().sum()

print("Total number of invalid entries:", total_invalid)

Total number of invalid entries: 0


In [26]:
X = np.array(dfAllFeatures)
y = np.array(dfLabels).ravel()

In [27]:
X.shape

(616499, 24)

In [28]:
y.shape

(616499,)

In [29]:
model = LogisticRegression()
model.fit(X, y)

In [30]:
predictions = model.predict(X)

In [31]:
type(predictions)

numpy.ndarray

In [32]:
predictions.shape

(616499,)

In [34]:
print(predictions)

[1 0 0 ... 0 0 0]


In [36]:
np.equal(predictions, y).sum()

np.int64(615586)

In [34]:
dfBi

Unnamed: 0,bigramLeft,bigramRight
0,.,19
1,1,93
2,19,36
3,93,6
4,36,y
...,...,...
616494,zd,ı.
616495,dı,.
616496,ı.,\n
616497,.,\n


In [60]:
dfBi['Combined'] = dfBi['bigramLeft'] + ' ' + dfBi['bigramRight']
dfBi

Unnamed: 0,bigramLeft,bigramRight,Combined
0,.,19,. 19
1,1,93,1 93
2,19,36,19 36
3,93,6,93 6
4,36,y,36 y
...,...,...,...
616494,zd,ı.,zd ı.
616495,dı,.,dı .
616496,ı.,\n,ı. \n
616497,.,\n,. \n


In [61]:
combinedList = dfBi['Combined'].tolist()
combinedList

['.  19',
 ' 1 93',
 '19 36',
 '93 6 ',
 '36  y',
 '6  yı',
 ' y ıl',
 'yı lı',
 'ıl ın',
 'lı nd',
 'ın da',
 'nd ay',
 'da yı',
 'ay ız',
 'yı z.',
 'ız . ',
 'z.  A',
 '.  Ad',
 ' A de',
 'Ad et',
 'de ta',
 'et a ',
 'ta  k',
 'a  ke',
 ' k en',
 'ke nd',
 'en di',
 'nd im',
 'di md',
 'im de',
 'md en',
 'de n ',
 'en  g',
 'n  ge',
 ' g eç',
 'ge çm',
 'eç mi',
 'çm iş',
 'mi ş ',
 'iş  b',
 'ş  bi',
 ' b ir',
 'bi r ',
 'ir  h',
 'r  ha',
 ' h al',
 'ha ld',
 'al de',
 'ld ey',
 'de yi',
 'ey im',
 'yi m.',
 'im . ',
 'm.  O',
 '.  O ',
 ' O  n',
 'O  na',
 ' n as',
 'na sı',
 'as ıl',
 'sı l ',
 'ıl  d',
 'l  de',
 ' d er',
 'de rs',
 'er se',
 'rs e ',
 'se  d',
 'e  de',
 ' d es',
 'de si',
 'es in',
 'si n\n',
 'in \nu',
 'n\n uğ',
 '\nu ğr',
 'uğ ra',
 'ğr aş',
 'ra şt',
 'aş tı',
 'şt ığ',
 'tı ğı',
 'ığ ı ',
 'ğı  s',
 'ı  sa',
 ' s an',
 'sa na',
 'an at',
 'na tı',
 'at ın',
 'tı n ',
 'ın  k',
 'n  ke',
 ' k en',
 'ke nd',
 'en di',
 'nd is',
 'di si',
 'is in',
 'si n

In [62]:
vectorizer = CountVectorizer(ngram_range=(2,2), analyzer="char")
bigramFeatureMatrix = vectorizer.fit_transform(combinedList)

In [72]:
bigramFeatureMatrix.toarray()[0]

1689

In [66]:
print(vectorizer.get_feature_names_out())

['\n"' '\n%' "\n'" ... '” ' '■\n' '■ ']


In [68]:
len(vectorizer.get_feature_names_out())

1689