In [47]:
# %pip install --upgrade langchain
# %pip install --upgrade llama-cpp-python

from pathlib import Path

import numpy as np
from langchain.embeddings import LlamaCppEmbeddings

# Path to the 7B model file with q4_0 quantisation (per the file name); it is relative to
# the directory the notebook kernel runs in.
llama_model_path = "../../llama2/llama.cpp/models/7B/ggml-model-q4_0.bin"

# Embedding client shared by every later cell; n_ctx is the context size handed to llama.cpp.
embeddings = LlamaCppEmbeddings(
    n_ctx=2048,
    model_path=llama_model_path,
)




llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from ../llama2/llama.cpp/models/7B/ggml-model-q4_0.bin (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:               output_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:                    output.weight q6_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_q.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.attn_k.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:              blk.0.attn_v.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    6:         blk.0.attn_output.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_gate.weight q4_0     [  4096,

In [48]:
# Load the transfer-news CSV and build the text column that will be embedded
import pandas as pd

input_path = "../datasets/90minFootballTransferNewsNLP.csv"

df = (
    pd.read_csv(input_path)
    # Keep only the fields used by the rest of the notebook.
    .loc[:, ["Title", "Date", "Link", "Content"]]
    # Discard any article that is missing one of those fields.
    .dropna()
    # "Combined" is the base text for the embeddings: stripped title plus stripped body.
    .assign(
        Combined=lambda articles: (
            "Title: "
            + articles["Title"].str.strip()
            + "; Content: "
            + articles["Content"].str.strip()
        )
    )
)
df.head(2)


Unnamed: 0,Title,Date,Link,Content,Combined
0,Football transfer rumours: Why Maguire's Man U...,"Aug 15, 2023",https://www.90min.com/posts/football-transfer-...,"90minÂ rounds up the latestÂ transfer news, ru...",Title: Football transfer rumours: Why Maguire'...
1,Chelsea agree Romeo Lavia fee with Southampton,"Aug 15, 2023",https://www.90min.com/posts/chelsea-agree-rome...,Chelsea have finalised an agreement with South...,Title: Chelsea agree Romeo Lavia fee with Sout...


In [49]:
# Report rows whose "Combined" text is blank once surrounding whitespace is removed
is_blank = df["Combined"].str.strip().eq("")
empty_rows = df.loc[is_blank]
if len(empty_rows) > 0:
    print("Empty rows in 'Combined' column:")
    print(empty_rows)


In [50]:
# prepare sets

top_n = 100  # number of leading rows kept as the main set to embed
n_test = 5   # number of rows held out immediately after the main set

# iloc's stop index is exclusive: the main set covers positions 0..top_n-1, so the held-out
# rows must start at position top_n. Starting at top_n + 1 left the row at position top_n
# in neither set.
# .copy() detaches both frames from the frame they are sliced from, so the "embedding"
# column assigned to them later is written to a real frame and pandas does not emit
# SettingWithCopyWarning.
# NOTE: df is overwritten here, so re-running this cell without first re-running the load
# cell would slice the already-truncated frame and produce an empty df_test.
df_test = df.iloc[top_n:top_n + n_test].copy()
df = df.iloc[:top_n].copy()

df.head(top_n)

Unnamed: 0,Title,Date,Link,Content,Combined
0,Football transfer rumours: Why Maguire's Man U...,"Aug 15, 2023",https://www.90min.com/posts/football-transfer-...,"90minÂ rounds up the latestÂ transfer news, ru...",Title: Football transfer rumours: Why Maguire'...
1,Chelsea agree Romeo Lavia fee with Southampton,"Aug 15, 2023",https://www.90min.com/posts/chelsea-agree-rome...,Chelsea have finalised an agreement with South...,Title: Chelsea agree Romeo Lavia fee with Sout...
2,Harry Maguire's proposed West Ham transfer col...,"Aug 15, 2023",https://www.90min.com/posts/harry-maguire-prop...,Harry Maguire's proposed transfer to West Ham ...,Title: Harry Maguire's proposed West Ham trans...
3,Southampton director breaks down Chelsea & Liv...,"Aug 15, 2023",https://www.90min.com/posts/southampton-direct...,Southampton director Jason Wilcox has revealed...,Title: Southampton director breaks down Chelse...
4,Neymar completes move from PSG to Al Hilal,"Aug 15, 2023",https://www.90min.com/posts/neymar-completes-m...,Saudi Pro League side Al Hilal have confirmed ...,Title: Neymar completes move from PSG to Al Hi...
...,...,...,...,...,...
95,Tottenham exploring new striker options despit...,"Aug 9, 2023",https://www.90min.com/posts/tottenham-new-stri...,Tottenham Hotspur are exploring opportunities ...,Title: Tottenham exploring new striker options...
96,Man City confident star duo will sign new cont...,"Aug 9, 2023",https://www.90min.com/posts/man-city-confident...,Manchester City believe they have convinced bo...,Title: Man City confident star duo will sign n...
97,Barcelona provide update on Ousmane Dembele's ...,"Aug 8, 2023",https://www.90min.com/posts/barcelona-update-o...,Barcelona vice-president Rafa Yuste has confir...,Title: Barcelona provide update on Ousmane Dem...
98,Chelsea send summer signing to Strasbourg on loan,"Aug 8, 2023",https://www.90min.com/posts/chelsea-send-summe...,Chelsea have confirmed that winger Angelo Gabr...,Title: Chelsea send summer signing to Strasbou...


In [51]:
# Best-effort wrapper around the embedding model: one bad document must not abort the run
def embed_query_safe(query):
    """Embed ``query`` with the notebook-level ``embeddings`` object.

    Args:
        query: Text passed unchanged to ``embeddings.embed_query``.

    Returns:
        Whatever ``embeddings.embed_query`` returns on success. If that call raises any
        ``Exception``, the query and the error message are printed and ``np.nan`` is
        returned instead, so failed rows can be located afterwards with ``isna()``.
    """
    try:
        vector = embeddings.embed_query(query)
    except Exception as err:
        print(f"Error embedding query: {query}")
        print(f"Error message: {str(err)}")
        vector = np.nan  # sentinel marking a problematic embedding
    return vector

In [22]:


# Long process... running with 100 rows took 40 min on M1 mac

# Embed the "Combined" text of every row. assign() builds a new frame instead of writing a
# column into df in place, so this is safe even when df is a slice of another frame.
df = df.assign(embedding=df["Combined"].apply(embed_query_safe))

# embed_query_safe returns NaN when the model raised, so NaN marks the failed rows
problematic_rows = df[df["embedding"].isna()]
print("Problematic Rows:")
print(problematic_rows)

# Keep only the rows that were embedded successfully
df = df.dropna(subset=["embedding"])

# Create the output directory first: to_csv raises if it does not exist, which would throw
# away the result of the long embedding run above.
embeddings_csv = Path("./results/footballTransferNewsEmbeddingsLLAMA.csv")
embeddings_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(embeddings_csv)

# Now, df should contain valid embeddings for non-problematic rows



llama_print_timings:        load time =   529.02 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 24472.74 ms /   546 tokens (   44.82 ms per token,    22.31 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 24555.29 ms

llama_print_timings:        load time =   529.02 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 17148.23 ms /   397 tokens (   43.19 ms per token,    23.15 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 17206.55 ms

llama_print_timings:        load time =   529.02 ms
llama_print_timings:   

Error embedding query: Title: Every time British transfer record broken in Premier League era; Content: As the years go by, more and more money is being spent by clubs in the transfer market. Lucrative broadcasting rights, commercial deals and other premium sponsorship means clubs are richer than ever, particularly if they have happen to being playing in the Premier League. The biggest transfers seen in world football may have happened around continental Europe, sure, but the level of spending in England, and regularity of it, is at an all-time high. And with Moises Caicedo's Â£115m move from Brighton to Chelsea now complete, the British transfer record has been smashed a whopping 19 times since the Premier League's inception in 1992. Here's every player and move to hold that accolade... Alan Shearer had only scored 23 goals in 118 appearances for Southampton by the time he left for Blackburn Rovers in 1992 - an average a goal just over every five games. But the soon-to-be Premier Leag


llama_print_timings:        load time =   529.02 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 11639.05 ms /   270 tokens (   43.11 ms per token,    23.20 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 11678.80 ms

llama_print_timings:        load time =   529.02 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 27993.90 ms /   628 tokens (   44.58 ms per token,    22.43 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 28084.73 ms

llama_print_timings:        load time =   529.02 ms
llama_print_timings:   

Problematic Rows:
                                                Title          Date  \
11  Every time British transfer record broken in P...  Aug 15, 2023   

                                                 Link  \
11  https://www.90min.com/posts/every-time-british...   

                                              Content  \
11  As the years go by, more and more money is bei...   

                                             Combined embedding  
11  Title: Every time British transfer record brok...       NaN  


In [52]:
# Get some test data

# Embed the "Combined" text of the held-out rows. assign() builds a new frame instead of
# writing a column into df_test in place, so this is safe even when df_test is a slice of
# another frame.
df_test = df_test.assign(embedding=df_test["Combined"].apply(embed_query_safe))

# embed_query_safe returns NaN when the model raised, so NaN marks the failed rows
problematic_rows = df_test[df_test["embedding"].isna()]
print("Problematic Rows:")
print(problematic_rows)

# Keep only the rows that were embedded successfully
df_test = df_test.dropna(subset=["embedding"])

# Create the output directory first: to_csv raises if it does not exist
test_embeddings_csv = Path("./results/footballTransferNewsTestEmbeddingsLLAMA.csv")
test_embeddings_csv.parent.mkdir(parents=True, exist_ok=True)
df_test.to_csv(test_embeddings_csv)


llama_print_timings:        load time =  3544.53 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 17394.09 ms /   323 tokens (   53.85 ms per token,    18.57 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 17443.90 ms

llama_print_timings:        load time =  3544.53 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 18796.27 ms /   422 tokens (   44.54 ms per token,    22.45 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 18858.20 ms

llama_print_timings:        load time =  3544.53 ms
llama_print_timings:   

Problematic Rows:
Empty DataFrame
Columns: [Title, Date, Link, Content, Combined, embedding]
Index: []



llama_print_timings:        load time =  3544.53 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 20525.64 ms /   467 tokens (   43.95 ms per token,    22.75 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 20591.93 ms
