In [None]:
# Load spaCy's 'en_core_web_sm' pipeline once, at module level.
# EntityExtractor.extract_entities calls this shared `nlp` object for every text.
nlp = spacy.load('en_core_web_sm')



# Custom transformer to extract entities
class EntityExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps raw text to its named entities.

    Every input text becomes a dict with one key per label in
    ``entity_types``; each value is the list of distinct entity strings that
    carry that label, in the order they are first seen in ``doc.ents``.

    Parameters
    ----------
    entity_types : sequence of str, default=('PERSON', 'ORG', 'GPE')
        Entity labels to keep. Entities with any other label are discarded.
    """

    # The default is a tuple rather than a list so that no mutable object is
    # shared between instances through the signature.
    def __init__(self, entity_types=('PERSON', 'ORG', 'GPE')):
        self.entity_types = entity_types

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (required by the Pipeline API)."""
        return self

    def transform(self, X, y=None):
        """Extract entities from every text in ``X``.

        Parameters
        ----------
        X : iterable of str
            Texts to analyse.
        y : ignored

        Returns
        -------
        list of dict
            One ``{label: [entity_text, ...]}`` dict per input text, in input
            order.
        """
        # nlp.pipe streams the texts through the model instead of invoking
        # the full pipeline separately for each one.
        return [self._collect_entities(doc) for doc in nlp.pipe(X)]

    def extract_entities(self, article_text):
        """Return the ``{label: [entity_text, ...]}`` dict for a single text."""
        return self._collect_entities(nlp(article_text))

    def _collect_entities(self, doc):
        """Group the wanted entities of an already-parsed ``doc`` by label."""
        entities = {entity_type: [] for entity_type in self.entity_types}

        for ent in doc.ents:
            if ent.label_ in entities:
                entities[ent.label_].append(ent.text)

        # dict.fromkeys drops duplicates but, unlike set(), keeps first-seen
        # order, so the result is identical from one run to the next.
        return {label: list(dict.fromkeys(texts)) for label, texts in entities.items()}

# Custom transformer to concatenate entity types into a single string
class EntityJoiner(BaseEstimator, TransformerMixin):
    """Flatten per-label entity dicts into one space-separated string each.

    Intended to sit between ``EntityExtractor`` and a text vectorizer.

    Parameters
    ----------
    entity_types : sequence of str, default=('PERSON', 'ORG', 'GPE')
        Labels whose entities are concatenated, in this order. A label that
        is absent from an input dict contributes nothing instead of raising
        ``KeyError``, so the joiner also works when the upstream extractor
        is configured with a different set of labels.
    """

    def __init__(self, entity_types=('PERSON', 'ORG', 'GPE')):
        self.entity_types = entity_types

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (required by the Pipeline API)."""
        return self

    def transform(self, X, y=None):
        """Join the entity texts of each dict in ``X`` with single spaces.

        Parameters
        ----------
        X : iterable of dict
            Mappings of entity label to a list of entity strings.
        y : ignored

        Returns
        -------
        list of str
            One string per input dict; empty when no requested label has
            any entity.
        """
        return [
            " ".join(
                text
                for entity_type in self.entity_types
                for text in entities.get(entity_type, ())
            )
            for entities in X
        ]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Raw text -> entity dicts -> one entity string per text -> TF-IDF matrix.
pipeline = Pipeline(
    steps=[
        # Pull the named entities out of each text.
        ('entity_extractor', EntityExtractor()),
        # Collapse each entity dict into a single string.
        ('entity_joiner', EntityJoiner()),
        # Turn the entity strings into TF-IDF vectors.
        ('vectorizer', TfidfVectorizer()),
    ]
)




In [None]:
# Fit the entity pipeline on the 'text' column of FOXdata.
# FOXdata is defined outside the cells shown here; presumably a DataFrame of
# scraped articles — TODO confirm.
pipeline.fit(FOXdata['text'])

In [None]:
# Vectorize the same texts with the pipeline.
# NOTE(review): if `pipeline` was already fitted on these texts, transform()
# runs entity extraction over the whole corpus a second time; a single
# pipeline.fit_transform(FOXdata['text']) would avoid the repeated pass.
vectorized_entities = pipeline.transform(FOXdata['text'])

In [None]:
# Materialize the vectorizer output as a dense array; this dense copy is what
# the NearestNeighbors model is fitted on later.
vectorized_entities_dense = vectorized_entities.toarray()

In [None]:
# Shape of the dense matrix (presumably documents x vocabulary terms).
print(vectorized_entities_dense.shape)


(25, 494)


In [None]:
# Build the neighbour index over the dense entity vectors; each query
# returns its 5 closest rows by default.
nn_model = NearestNeighbors(n_neighbors=5).fit(vectorized_entities_dense)

# Function to find articles by entity query
def find_articles_by_entity(query, pipeline, nn_model, articles_df, n_neighbors=None):
    """Return the articles whose entity vectors lie closest to ``query``.

    Parameters
    ----------
    query : str
        Free text naming the entity (or entities) to search for.
    pipeline : fitted transformer
        Object whose ``transform`` turns a list of texts into vectors in the
        same feature space ``nn_model`` was fitted on.
    nn_model : fitted neighbours model
        Object exposing ``kneighbors(X, n_neighbors=...)``.
    articles_df : pandas.DataFrame
        Articles in the same row order as the vectors used to fit
        ``nn_model``; must contain 'title', 'link' and 'pubDate' columns.
    n_neighbors : int, optional
        Number of articles to return. ``None`` (the default) uses the value
        ``nn_model`` was constructed with.

    Returns
    -------
    pandas.DataFrame
        The 'title', 'link' and 'pubDate' columns of the matching rows,
        nearest first. Empty when the query shares no feature with the
        fitted vocabulary.
    """
    result_columns = ['title', 'link', 'pubDate']
    query_vector = pipeline.transform([query])

    # An all-zero vector means nothing in the query was recognised. Every
    # article would then be an equally arbitrary "neighbour", so report no
    # match rather than returning misleading results.
    # `!= 0` works for both dense arrays and scipy sparse matrices.
    if (query_vector != 0).sum() == 0:
        return articles_df.iloc[0:0][result_columns]

    _, indices = nn_model.kneighbors(query_vector, n_neighbors=n_neighbors)

    # indices has one row per query; only a single query was submitted.
    return articles_df.iloc[indices[0]][result_columns]

In [None]:
# Demo: look up the articles closest to the entity "Elon Musk".
query = "Elon Musk"
similar_articles = find_articles_by_entity(
    query=query,
    pipeline=pipeline,
    nn_model=nn_model,
    articles_df=FOXdata,
)

print(similar_articles)

                                                title  \
23  Whistleblowers make new claims about security ...   
2   Family of Oklahoma teen Noah Presgrove speaks ...   
11  Georgia high school shooting: Biden decries 'm...   
18  Hochul aide accused of working for CCP used po...   
22  California nudist ranch neighbor charged with ...   

                                                 link  \
23  https://www.foxnews.com/us/whistleblowers-make...   
2   https://www.foxnews.com/us/family-oklahoma-tee...   
11  https://www.foxnews.com/us/georgia-high-school...   
18  https://www.foxnews.com/us/hochul-aide-accused...   
22  https://www.foxnews.com/us/california-nudist-r...   

                            pubDate  
23  Wed, 04 Sep 2024 06:25:56 -0400  
2   Thu, 05 Sep 2024 04:00:30 -0400  
11  Wed, 04 Sep 2024 15:43:21 -0400  
18  Wed, 04 Sep 2024 09:39:11 -0400  
22  Wed, 04 Sep 2024 08:29:58 -0400  


In [None]:
# def extract_features(tokens):
#     features = []
#     for i, (word, tag) in enumerate(tokens):
#         token_features = {
#             'word': word,
#             'is_capitalized': word[0].isupper(),
#             'is_digit': word.isdigit(),
#             'prefix-1': word[0],  # First letter of the word
#             'suffix-3': word[-3:],  # Last 3 letters of the word
#             'pos': nlp(word)[0].pos_  # Part of speech tag
#         }
#         features.append(token_features)
#     return features

In [None]:
# Training set: split each sentence's (token, tag) pairs into a list of
# tokens (X_train) and a parallel list of tags (y_train).
X_train = [[word for word, _tag in sentence] for sentence in train['tokens']]
y_train = [[label for _word, label in sentence] for sentence in train['tokens']]


In [None]:
# Flatten the list of dictionaries for each token in X_train
# X_train_flattened = [item for sublist in X_train for item in sublist]
# y_train_flattened = [item for sublist in y_train for item in sublist]
# Flatten the list of dictionaries per sentence
def flatten_features_labels(X, y):
    """Concatenate per-sentence feature and label sequences into flat lists.

    Sentences are paired positionally with ``zip``, so any surplus sentences
    in the longer of ``X`` / ``y`` are ignored.

    Returns
    -------
    tuple of (list, list)
        All features followed by all labels, each in sentence order.
    """
    sentence_pairs = list(zip(X, y))
    X_flat = [item for features, _ in sentence_pairs for item in features]
    y_flat = [item for _, labels in sentence_pairs for item in labels]
    return X_flat, y_flat

# Collapse the per-sentence token lists and tag lists into two flat lists
# (one entry per token across all training sentences).
X_train_flattened, y_train_flattened = flatten_features_labels(X_train, y_train)

In [None]:
# len(X_train_flattened)
# Total number of training labels after flattening (displayed as the cell's output).
len(y_train_flattened)

1981

In [None]:
# ('vectorizer', CountVectorizer(analyzer='char', ngram_range=(1, 3)))

# Bag-of-words encoding in which every flattened training token is treated as
# its own one-word document.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more alphanumeric characters, so single-character and punctuation
# tokens would produce all-zero rows — confirm this is intended.
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(X_train_flattened)

# Densified only for display; numpy abbreviates the printed matrix.
print(X.toarray())


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# Predict on test data
# NOTE(review): this cell currently fails (see the recorded NotFittedError).
# X_test / y_test are not defined in the visible cells, and the `pipeline`
# built earlier in this notebook ends in a TfidfVectorizer, which has no
# predict(). Presumably `pipeline` is meant to be a separate tagging pipeline
# (vectorizer + classifier) that must be fitted on the training data before
# this cell runs — TODO confirm and give it a distinct name.
y_pred = pipeline.predict(X_test)



# Generate a classification report
# Per-label metrics restricted to the four listed tags, printed with 4 decimals.
report = flat_classification_report(y_test, y_pred, labels=["O", "ORG", "PERSON", "LOC"], digits=4)
print(report)


NotFittedError: Vocabulary not fitted or provided