In [None]:
def is_date(d):
    """Return True when ``d`` is, in its entirety, a numeric date-shaped token.

    Accepted layouts (digit groups only; calendar validity is not checked):
    ``NNNN-NN-NN``, ``NN/NN/NNNN`` and ``NN-NN-NNNN``.

    Args:
        d: The token text to test.

    Returns:
        bool: True if the whole string has one of the layouts above.
    """
    # Imported locally so the function does not depend on another cell importing `re`.
    import re

    # All alternatives share one group: written as `^A|B|C$`, the `^` would bind only
    # to the first layout and the `$` only to the last.
    pattern = r'(?:\d{4}-\d{2}-\d{2}|\d{2}/\d{2}/\d{4}|\d{2}-\d{2}-\d{4})'
    # fullmatch anchors both ends and, unlike `$`, does not tolerate a trailing newline.
    return bool(re.fullmatch(pattern, d))
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Each row of ``sent`` must carry the word at index 0 and its POS tag at
    index 1. The result describes the token itself (lower-cased form, casing
    flags, first/last character, POS tag, date flag) plus the lower-cased form
    and casing flags of the previous and next tokens. When a neighbour is
    missing, a ``BOS`` / ``EOS`` marker is set instead.

    Args:
        sent: Sequence of token rows.
        i: Index of the token to describe.

    Returns:
        dict: Feature name -> value.
    """
    current = sent[i][0]

    feats = {}
    feats['word'] = current.lower()
    feats['is_title'] = current.istitle()
    feats['is_upper'] = current.isupper()
    feats['is_lower'] = current.islower()
    feats['suffix-1'] = current[-1]
    feats['prefix-1'] = current[0]
    feats['pos'] = sent[i][1]
    feats['is_date'] = is_date(current)

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        previous = sent[i - 1][0]
        feats['-1:word'] = previous.lower()
        feats['-1:is_title'] = previous.istitle()
        feats['-1:is_upper'] = previous.isupper()

    # Right context, or the end-of-sentence marker.
    last_position = len(sent) - 1
    if i >= last_position:
        feats['EOS'] = True
    else:
        following = sent[i + 1][0]
        feats['+1:word'] = following.lower()
        feats['+1:is_title'] = following.istitle()
        feats['+1:is_upper'] = following.isupper()

    return feats

def sentence2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sentence2labels(sent):
    """Return the label (fourth column) of every token row in ``sent``.

    Every row must unpack into exactly four values: token, POS tag, chunk tag
    and label.
    """
    labels = []
    for _token, _pos, _chunk, ner_label in sent:
        labels.append(ner_label)
    return labels


In [None]:
def read_conll_file(file_path, encoding="utf-8"):
    """Parse a CoNLL-style file into sentences of whitespace-split token rows.

    The file holds one token per line (columns separated by whitespace), with
    sentences separated by blank lines. Every non-blank line is kept, so a
    ``-DOCSTART-`` marker line surrounded by blank lines comes back as its own
    one-token sentence.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to open the file. It is set explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        list[list[list[str]]]: Sentences, each a list of token rows, each row
        the list of column strings of one line. An empty file yields ``[]``.
    """
    data = []
    current_sentence = []
    with open(file_path, "r", encoding=encoding) as f:
        for line in f:
            columns = line.split()
            if columns:
                current_sentence.append(columns)
            elif current_sentence:
                # A blank or whitespace-only line closes the sentence. Runs of such
                # lines never emit empty sentences or empty token rows, which would
                # break the feature extraction downstream.
                data.append(current_sentence)
                current_sentence = []
    # Flush the final sentence when the file does not end with a blank line.
    if current_sentence:
        data.append(current_sentence)
    return data

In [None]:
# Parse the three dataset splits (training, validation, test) in one pass.
train_data, validation_data, test_data = map(
    read_conll_file,
    ("eng.train", "eng.testa", "eng.testb"),
)

# train_data

In [None]:
# Peek at the first parsed sentence; each token is the list of its whitespace-separated columns.
train_data[0]

[['-DOCSTART-', '-X-', '-X-', 'O']]

In [None]:
# Per-sentence feature dicts (X_*) and per-sentence label lists (y_*) for the
# training and test splits.
X_train = list(map(sentence2features, train_data))
y_train = list(map(sentence2labels, train_data))
X_test = list(map(sentence2features, test_data))
y_test = list(map(sentence2labels, test_data))


In [None]:
# Show only the first two sentences' feature dicts; displaying all of X_train
# floods the notebook with output.
X_train[:2]

[[{'word': '-docstart-',
   'is_title': False,
   'is_upper': True,
   'is_lower': False,
   'suffix-1': '-',
   'prefix-1': '-',
   'pos': '-X-',
   'BOS': True,
   'EOS': True}],
 [{'word': 'eu',
   'is_title': False,
   'is_upper': True,
   'is_lower': False,
   'suffix-1': 'U',
   'prefix-1': 'E',
   'pos': 'NNP',
   'BOS': True,
   '+1:word': 'rejects',
   '+1:is_title': False,
   '+1:is_upper': False},
  {'word': 'rejects',
   'is_title': False,
   'is_upper': False,
   'is_lower': True,
   'suffix-1': 's',
   'prefix-1': 'r',
   'pos': 'VBZ',
   '-1:word': 'eu',
   '-1:is_title': False,
   '-1:is_upper': True,
   '+1:word': 'german',
   '+1:is_title': True,
   '+1:is_upper': False},
  {'word': 'german',
   'is_title': True,
   'is_upper': False,
   'is_lower': False,
   'suffix-1': 'n',
   'prefix-1': 'G',
   'pos': 'JJ',
   '-1:word': 'rejects',
   '-1:is_title': False,
   '-1:is_upper': False,
   '+1:word': 'call',
   '+1:is_title': False,
   '+1:is_upper': False},
  {'word': 

In [None]:
import joblib
from sklearn.feature_extraction import DictVectorizer

# Flatten directly from the parsed sentences rather than from X_train / y_train.
# This cell rebinds those names, so reading them here would make a second run of the
# cell operate on already-flattened data (e.g. label strings split into characters).
train_feature_dicts = [f for sent in train_data for f in sentence2features(sent)]
test_feature_dicts = [f for sent in test_data for f in sentence2features(sent)]

# One sparse row per token; the feature vocabulary is learned from the training split only.
vec = DictVectorizer(sparse=True)
X_train = vec.fit_transform(train_feature_dicts)
y_train = [label for sent in train_data for label in sentence2labels(sent)]

X_test = vec.transform(test_feature_dicts)
y_test = [label for sent in test_data for label in sentence2labels(sent)]

# Persist the fitted vectorizer so the same feature mapping can be reloaded later.
joblib.dump(vec, 'dict_vectorizer.pkl')


['dict_vectorizer.pkl']

In [None]:
# Inspect the first vectorized training row (one row of the sparse matrix built by DictVectorizer).
X_train[0]

<1x61615 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# Map the string labels to integer class ids; the encoder is fitted on the training labels.
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

# A fixed random_state makes the fit reproducible from one run of the notebook to the next.
model = LinearSVC(random_state=42)
model.fit(X_train, y_train_enc)


In [None]:
# Print the stored entries of the first vectorized test row as (row, column) value pairs.
print(X_test[0])
# print(y_test[0])

  (0, 40387)	1.0
  (0, 40388)	1.0
  (0, 40389)	0.0
  (0, 40390)	0.0
  (0, 40391)	1.0
  (0, 40398)	1.0
  (0, 40449)	1.0
  (0, 40533)	1.0
  (0, 40727)	1.0


In [None]:
from sklearn.metrics import classification_report

# Predict a class id for every test token, then map the ids back to label strings so
# the report rows show the label names instead of opaque integers.
y_pred = model.predict(X_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Per-token scores: every token counts as one sample.
print(classification_report(y_test, y_pred_labels))


              precision    recall  f1-score   support

           0       0.84      0.87      0.86      1668
           1       0.80      0.80      0.80       702
           2       0.82      0.72      0.77      1661
           3       0.85      0.86      0.85      1617
           4       0.73      0.74      0.74       257
           5       0.70      0.66      0.68       216
           6       0.75      0.72      0.73       835
           7       0.86      0.96      0.91      1156
           8       0.99      0.99      0.99     38554

    accuracy                           0.96     46666
   macro avg       0.82      0.81      0.81     46666
weighted avg       0.96      0.96      0.96     46666



In [None]:
import nltk
from nltk import pos_tag, word_tokenize
from nltk.chunk import RegexpParser

# Fetch the tokenizer and tagger resources requested by this notebook.
for resource_name in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource_name)

# Demo sentence to run through the tagger.
sentence = "Washington DC is a city in USA"

# Split into word tokens, then attach a part-of-speech tag to each one.
words = word_tokenize(sentence)
pos_tags = pos_tag(words)
print("POS Tags:", pos_tags)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


POS Tags: [('Washington', 'NNP'), ('DC', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('city', 'NN'), ('in', 'IN'), ('USA', 'NNP')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Feature dicts for the demo sentence (rows are (word, POS) pairs), vectorized with the
# fitted DictVectorizer; predicted class ids are mapped back to their label strings.
data = sentence2features(pos_tags)
y_pred = model.predict(vec.transform(data))
predicted_labels = label_encoder.inverse_transform(y_pred)
print(predicted_labels)


['B-LOC' 'I-LOC' 'O' 'O' 'O' 'O' 'B-LOC']
