# Imports

In [2]:
import warnings
warnings.filterwarnings('ignore')
import scrapy
from scrapy.crawler import CrawlerProcess
import re
import pandas as pd
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse


# Translation

In [3]:
# Transliteration table: one Latin/ASCII symbol -> the Hebrew text it stands for.
# Keys are case-sensitive single characters; replace_with_heb_chars looks each
# character of a word up here and passes unmapped characters through unchanged.
# '"' and '^' map to the empty string, i.e. they are deleted during transliteration.
LAT_TO_HEB = {
    ')': 'א',
    'b': 'ב',
    'g': 'ג',
    'd': 'ד',
    'h': 'ה',
    'w': 'ו',
    'z': 'ז',
    'x': 'ח',
    'T': 'ט',
    'y': 'י',
    'k': 'כ',
    'K': 'ך',
    'l': 'ל',
    'm': 'מ',
    'M': 'ם',
    'n': 'נ',
    'N': 'ן',
    's': 'ס',
    '(': 'ע',
    'p': 'פ',
    'P': 'ף',
    'c': 'צ',
    'C': 'ץ',
    'q': 'ק',
    'r': 'ר',
    '$': 'ש',
    '&': 'שׂ',
    't': 'ת',
    '"': '',
    '^': '',
}

# Function to replace Latin characters with Hebrew
def replace_with_heb_chars(word: str) -> str:
    """Transliterate ``word`` one character at a time via ``LAT_TO_HEB``.

    Characters that have no entry in the table are kept as they are.
    """
    converted = []
    for symbol in word:
        converted.append(LAT_TO_HEB.get(symbol, symbol))
    return ''.join(converted)

def lat_to_heb(st: str) -> str:
    """Transliterate every whitespace-separated token of ``st``.

    Tokens are rejoined with a single space, so runs of whitespace collapse
    and leading/trailing whitespace disappears.
    """
    return ' '.join(map(replace_with_heb_chars, st.split()))

# Define all the transformation functions
def replace_sub(orig_str: str, start: int, end: int, replacement: str) -> str:
    """Return ``orig_str`` with positions ``start..end`` (``end`` inclusive) swapped for ``replacement``."""
    prefix = orig_str[:start]
    suffix = orig_str[end + 1:]
    return ''.join((prefix, replacement, suffix))

def r3_slash_slash2(st: str) -> str:
    """Collapse ``X /Y/`` to ``Y/``.

    ``cont1`` is either a backslash followed by non-slash characters, or a run
    of characters that are neither ``/`` nor a space. It must be followed by
    optional whitespace, a ``/``, and ``cont2`` (everything up to and including
    the next ``/``). Each such match is replaced by ``cont2`` alone.
    """
    cont1 = r"(?P<cont1>\\[^/]+|[^/ ]+)"
    cont2 = r"(?P<cont2>[^/]*/)"
    # The template must be a raw string: '\g' is not a valid Python escape, so
    # a plain literal triggers an invalid-escape warning (a SyntaxWarning on
    # Python 3.12+) even though it happens to produce the same characters.
    return re.sub(rf"{cont1}\s*/{cont2}", r'\g<cont2>', st)

def r3_slash_slash(st: str) -> str:
    """Replace every match of the ``r3`` pattern with its first capture group.

    Returns ``st`` unchanged when the pattern does not occur.
    """
    # NOTE(review): the pattern matches the literal word `cont`, not arbitrary
    # content; it looks like an unfinished placeholder. transform_str uses
    # r3_slash_slash2 instead. Confirm intent before relying on this rule.
    r3 = re.compile(r'(cont) "/" (cont) "/"')
    # Pattern.sub already returns the input untouched when nothing matches, so
    # no emptiness guard is needed (the previous `if not matches` tested an
    # iterator, which is always truthy, and could never fire).
    return r3.sub(lambda match: match.group(1), st)

def r_5_4_hash_hash(st: str) -> str:
    """Delete every ``#...#`` span, including both ``#`` delimiters."""
    hash_span = re.compile(r'#[^#]*#')
    return hash_span.sub('', st)

def r2_squigglie_slash_slash(st: str) -> str:
    """Delete every occurrence of the three-character sequence ``"/"``."""
    quoted_slash = re.compile(r'"/"')
    return quoted_slash.sub('', st)

def r11_double_squiggles(st: str) -> str:
    """Delete every ``{{`` and ``}}`` marker, keeping whatever lies between them."""
    double_brace = re.compile(r'\{\{|\}\}')
    return double_brace.sub('', st)

def r12_14_question(st: str) -> str:
    """Delete every question mark."""
    question_mark = re.compile(r'\?')
    return question_mark.sub('', st)

def r1_squiggles(st: str) -> str:
    """Delete every ``{...}`` span whose content contains no braces.

    Runs a single pass, so in nested braces only the innermost span is removed.
    """
    brace_span = re.compile(r'\{[^{}]*\}')
    return brace_span.sub('', st)

def r6_quotes(st: str) -> str:
    """Delete every double-quoted span, quotes included."""
    quoted_span = re.compile(r'"[^"]*"')
    return quoted_span.sub('', st)

def r8_7_dots(st: str) -> str:
    """Delete every run of two or more consecutive dots; single dots are kept."""
    dot_run = re.compile(r'\.{2,}')
    return dot_run.sub('', st)

def r9_chevrons(st: str) -> str:
    """Delete every ``<`` and ``>`` character, keeping the text between them."""
    chevron = re.compile(r'[<>]')
    return chevron.sub('', st)

def r10_carets(st: str) -> str:
    """Delete every ``^...^`` span, including both carets."""
    caret_span = re.compile(r'\^[^^]*\^')
    return caret_span.sub('', st)

def r13_backslash(st: str) -> str:
    """Delete every backslash."""
    pieces = st.split('\\')
    return ''.join(pieces)

def r15_caret(st: str) -> str:
    """Delete every caret character."""
    caret = re.compile(r'\^')
    return caret.sub('', st)

def r16_comma(st: str) -> str:
    """Delete every comma."""
    comma = re.compile(r',')
    return comma.sub('', st)

def r17_plus(st: str) -> str:
    """Delete every plus sign."""
    plus_sign = re.compile(r'\+')
    return plus_sign.sub('', st)

def r18_at_equals_dash(st: str) -> str:
    """Turn ``@`` into a space and delete ``=`` and ``-``."""
    result = st
    for old, new in (('@', ' '), ('=', ''), ('-', '')):
        result = result.replace(old, new)
    return result

def r19_squiggle_space(st: str) -> str:
    """Replace each ``{`` that is directly followed by a space with just the space."""
    segments = st.split('{ ')
    return ' '.join(segments)

def r20_remove_braces(st: str) -> str:
    """Delete every square bracket, keeping the text inside."""
    result = st
    for bracket in ('[', ']'):
        result = result.replace(bracket, '')
    return result

def r21_remove_bang(st: str) -> str:
    """Delete every exclamation mark."""
    return ''.join(st.split('!'))

def r22_backtick_to_space(st: str) -> str:
    """Replace every backtick with a space."""
    return ' '.join(st.split('`'))

def r_WS(st: str) -> str:
    """Collapse runs of spaces into one space, then strip surrounding whitespace."""
    return re.sub(r' +', ' ', st).strip()

def transform_str(st: str) -> str:
    """Transliterate ``st`` to Hebrew, then run the clean-up rules in a fixed order.

    The order matters: each rule receives the output of the one before it.

    NOTE(review): ``LAT_TO_HEB`` maps '"' and '^' to the empty string and the
    transliteration runs first, so by the time the quote and caret rules
    (r10_carets, r15_caret, r6_quotes, r2_squigglie_slash_slash) run there is
    nothing left for them to match. Confirm this is intended.
    """
    pipeline = (
        r9_chevrons,
        r_5_4_hash_hash,
        r16_comma,
        r12_14_question,
        r10_carets,
        r15_caret,
        r17_plus,
        r8_7_dots,
        r3_slash_slash2,
        r6_quotes,
        r13_backslash,
        r2_squigglie_slash_slash,
        r11_double_squiggles,
        r1_squiggles,
        r18_at_equals_dash,
        r19_squiggle_space,
        r20_remove_braces,
        r21_remove_bang,
        r22_backtick_to_space,
        r_WS,
    )

    result = lat_to_heb(st)
    for rule in pipeline:
        result = rule(result)
    return result

def apply_transformation(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Add a ``<column_name>_transformed`` column holding ``transform_str`` of each value.

    The column is written onto ``df`` itself, and that same object is returned.
    """
    target_column = f'{column_name}_transformed'
    transformed = df[column_name].apply(transform_str)
    df[target_column] = transformed
    return df

if __name__ == "__main__":
    # Smoke test: run the transliteration on two sample words, one wrapped in
    # carets and one wrapped in double quotes.
    df_check = apply_transformation(
        pd.DataFrame({'text': ['^tn)^', '"lytny"']}),
        'text',
    )

# Bare last expression so the notebook renders the resulting frame.
df_check

Unnamed: 0,text,text_transformed
0,^tn)^,תנא
1,"""lytny""",ליתני


# Sort

In [4]:

def replace_x_in_coord(url: str) -> str:
    """Return ``url`` with every ``x`` in its ``coord`` query parameter replaced by ``1``.

    Args:
        url: A URL or relative reference, e.g. ``bablex.php?coord=...&word=0``.

    Returns:
        The URL rebuilt with the updated ``coord`` value(s). A URL that has no
        ``coord`` parameter is returned unchanged; no empty ``coord=`` is
        appended to it.
    """
    parsed_url = urlparse(url)
    # keep_blank_values=True so parameters such as `word=` survive the round
    # trip; by default parse_qs drops them.
    query_params = parse_qs(parsed_url.query, keep_blank_values=True)

    if 'coord' not in query_params:
        return url

    # Rewrite every coord value, so a repeated parameter is not truncated to
    # its first occurrence.
    query_params['coord'] = [value.replace('x', '1') for value in query_params['coord']]

    new_query_string = urlencode(query_params, doseq=True)
    return urlunparse(parsed_url._replace(query=new_query_string))




In [5]:

def sort_by_coord_and_word(df, url_column='url'):
    """Return the rows of ``df`` ordered by the ``coord`` and ``word`` numbers in their URLs.

    The sort keys are taken from ``coord=<digits>&word=<digits>`` in
    ``url_column``. The first 7 digits of ``coord`` are ignored, and the rest
    is compared as an integer, followed by ``word`` as an integer. Rows whose
    URL does not match, or whose ``coord`` has no digits left after the first
    7, sort with key 0.

    Args:
        df: Frame containing ``url_column``. It is not modified.
        url_column: Name of the column holding the URLs.

    Returns:
        A new DataFrame with the same columns and original index labels, in
        sorted order.
    """
    def _as_int(series):
        # Anything that is not a digit string (NaN from a failed match, or ''
        # from a short coord) becomes 0 instead of raising in the int cast.
        return pd.to_numeric(series, errors='coerce').fillna(0).astype('int64').to_numpy()

    extracted = df[url_column].str.extract(r'coord=(\d+)&word=(\d+)')

    # The keys live in a separate positional frame so that no temporary
    # 'coord'/'word' columns are written onto the caller's DataFrame.
    sort_keys = pd.DataFrame({
        'coord': _as_int(extracted[0].str[7:]),
        'word': _as_int(extracted[1]),
    })
    order = sort_keys.sort_values(by=['coord', 'word'], ascending=[True, True]).index.to_numpy()

    return df.iloc[order]

# Data Process

In [6]:
def data_process(file_path):
    """Load a JSON file and return it transliterated and sorted by URL position.

    Steps: read the file with ``pd.read_json``, add the ``text_transformed``
    column, replace ``x`` with ``1`` in each URL's ``coord`` parameter, then
    sort with ``sort_by_coord_and_word``.
    """
    frame = apply_transformation(pd.read_json(file_path), 'text')
    frame['url'] = frame['url'].apply(replace_x_in_coord)
    return sort_by_coord_and_word(frame)

# Convert to Dataframe

In [7]:

# Location of the JSON input file.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs on a machine where this exact file exists.
file_path = r'C:\Users\USER\Desktop\Final_Project\Data\Masekhet_ber.json'

# Load the file into a DataFrame (raw; no transformation applied yet).
df = pd.read_json(file_path)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,text,url,lexicon_0,lexicon_1,lexicon_2,lexicon_3,lexicon_4
0,tn),bablex.php?coord=7100101002115&word=0,\n\n\ntn) noun sg. emphatic,,,,
1,"""(rbyt""",bablex.php?coord=7100101002124&word=1,"there is no data for this word, it may be unde...",,,,
2,lytny,bablex.php?coord=7100101002124&word=0,\n\n\ntny verb G,,,,
3,tn),bablex.php?coord=7100101002126&word=0,\n\n\ntny verb G,,,,
4,tn),bablex.php?coord=7100101002125&word=1,\n\n\ntn) noun sg. emphatic,,,,
...,...,...,...,...,...,...,...
17532,wtw;,bablex.php?coord=7100101002116&word=0,\n\n\nw_ c,tw a = tw X --> twb X,,,
17533,"""m)ymty""",bablex.php?coord=7100101002115&word=4,"there is no data for this word, it may be unde...",,,,
17534,dqtny,bablex.php?coord=7100101002115&word=3,\n\n\nd_ c = d_ c --> dy c,tny verb G,,,
17535,q)y,bablex.php?coord=7100101002115&word=2,\n\n\nqwm verb G,,,,


In [8]:
# Per-column count of missing values (None/NaN) in df.
null_counts = df.isnull().sum()

# Print the resulting Series: one line per column.
print(null_counts)

text             0
url              0
lexicon_0        0
lexicon_1    14246
lexicon_2    17490
lexicon_3    17533
lexicon_4    17537
dtype: int64


In [9]:
# Distinct values found in the 'lexicon_3' column (missing values included).
unique_values = df['lexicon_3'].unique()

# Print the array of distinct values.
print(unique_values)


[None 'Trz noun pl. emphatic' '(lm noun sg. emphatic'
 'm(rb noun sg. emphatic']


In [10]:
# Distinct values found in the 'lexicon_2' column (missing values included).
# Reuses the name unique_values, overwriting the previous cell's result.
unique_values = df['lexicon_2'].unique()

# Print the array of distinct values.
print(unique_values)


[None ')tr) noun sg. abs. or construct' 'lyly) noun sg. emphatic'
 'dr$ verb G' 'hyk) a' 'd(t) noun sg. emphatic' 'Twrzyn noun sg. emphatic'
 '(lm noun sg. emphatic' '&m)l) noun sg. emphatic= &m)l N --> sm)l N'
 '$lp verb G' 'hytyr) noun sg. emphatic'
 'rbnn noun pl. emphatic= rbnn N --> rbn N' 'ry$@glwt) noun sg. emphatic'
 'dyd_ P02' 'kl noun pl. construct' 'mTwy A02'
 'non-Aramaic or fragmentary form' 'br noun pl. construct'
 'rb noun sg. emphatic' 'hk P01' 'h) P01' 'lgby p01' ')mr verb G'
 'ywm) noun pl. construct' 'qysr noun sg. abs. or construct'
 '$bh noun sg. emphatic' '(bd verb G' 'c(r) noun sg. emphatic'
 'tny verb C' 'tmrt) noun pl. emphatic' 'xmr) noun sg. emphatic'
 '(l@krx_ p02']


# Translation

In [11]:
# Transliterate the 'text' column of the loaded frame. apply_transformation
# stores the result in a new 'text_transformed' column.
df = apply_transformation(df, 'text')

# Bare last expression so the notebook renders the frame with the new column.
df

Unnamed: 0,text,url,lexicon_0,lexicon_1,lexicon_2,lexicon_3,lexicon_4,text_transformed
0,tn),bablex.php?coord=7100101002115&word=0,\n\n\ntn) noun sg. emphatic,,,,,תנא
1,"""(rbyt""",bablex.php?coord=7100101002124&word=1,"there is no data for this word, it may be unde...",,,,,ערבית
2,lytny,bablex.php?coord=7100101002124&word=0,\n\n\ntny verb G,,,,,ליתני
3,tn),bablex.php?coord=7100101002126&word=0,\n\n\ntny verb G,,,,,תנא
4,tn),bablex.php?coord=7100101002125&word=1,\n\n\ntn) noun sg. emphatic,,,,,תנא
...,...,...,...,...,...,...,...,...
17532,wtw;,bablex.php?coord=7100101002116&word=0,\n\n\nw_ c,tw a = tw X --> twb X,,,,ותו;
17533,"""m)ymty""",bablex.php?coord=7100101002115&word=4,"there is no data for this word, it may be unde...",,,,,מאימתי
17534,dqtny,bablex.php?coord=7100101002115&word=3,\n\n\nd_ c = d_ c --> dy c,tny verb G,,,,דקתני
17535,q)y,bablex.php?coord=7100101002115&word=2,\n\n\nqwm verb G,,,,,קאי


In [12]:
# Replace every 'x' in each URL's 'coord' query parameter with '1'.
df['url'] = df['url'].apply(replace_x_in_coord)

# Order the rows by the coord and word numbers embedded in the 'url' column.
sorted_df = sort_by_coord_and_word(df)

# Bare last expression so the notebook renders the sorted frame.
sorted_df

Unnamed: 0,text,url,lexicon_0,lexicon_1,lexicon_2,lexicon_3,lexicon_4,text_transformed
0,tn),bablex.php?coord=7100101002115&word=0,\n\n\ntn) noun sg. emphatic,,,,,תנא
17536,h^y^k),bablex.php?coord=7100101002115&word=1,\n\n\nhyk) a,,,,,היכא
17535,q)y,bablex.php?coord=7100101002115&word=2,\n\n\nqwm verb G,,,,,קאי
17534,dqtny,bablex.php?coord=7100101002115&word=3,\n\n\nd_ c = d_ c --> dy c,tny verb G,,,,דקתני
17533,"""m)ymty""",bablex.php?coord=7100101002115&word=4,"there is no data for this word, it may be unde...",,,,,מאימתי
...,...,...,...,...,...,...,...,...
365,l),bablex.php?coord=7100101064109&word=3,\n\n\nl) a,,,,,לא
364,qdM,bablex.php?coord=7100101064109&word=4,\n\n\nqdm verb G,,,,,קדם
363,lbytyh,bablex.php?coord=7100101064109&word=5,\n\n\nl_ p03,byt) noun sg. emphatic,,,,לביתיה
362,drb,bablex.php?coord=7100101064109&word=6,\n\n\nd_ p = d_ p --> dy p,rb noun sg. emphatic,,,,דרב


In [13]:
# Number of rows whose 'text_transformed' value is missing (NaN) plus the number
# whose value is the empty string.
empty_count = sorted_df['text_transformed'].isna().sum() + (sorted_df['text_transformed'] == '').sum()

print(f"Number of empty values in the column: {empty_count}")

Number of empty values in the column: 55


In [14]:
# Keep only the rows whose 'text_transformed' value is missing (NaN) or the
# empty string.
empty_rows = sorted_df[sorted_df['text_transformed'].isna() | (sorted_df['text_transformed'] == '')]

# Print a heading, then end on the bare frame so the notebook renders it.
print("Rows where 'text_transformed' is empty:")
empty_rows


Rows where 'text_transformed' is empty:


Unnamed: 0,text,url,lexicon_0,lexicon_1,lexicon_2,lexicon_3,lexicon_4,text_transformed
17364,<...>,bablex.php?coord=7100101002232&word=5,"there is no data for this word, it may be unde...",,,,,
16875,"...""",bablex.php?coord=7100101004237&word=3,"there is no data for this word, it may be unde...",,,,,
16430,"...""",bablex.php?coord=7100101006152&word=13,"there is no data for this word, it may be unde...",,,,,
16424,"...""",bablex.php?coord=7100101006152&word=18,"there is no data for this word, it may be unde...",,,,,
16421,"...""",bablex.php?coord=7100101006152&word=22,"there is no data for this word, it may be unde...",,,,,
16417,"...""",bablex.php?coord=7100101006152&word=27,"there is no data for this word, it may be unde...",,,,,
16395,"...""",bablex.php?coord=7100101006154&word=4,"there is no data for this word, it may be unde...",,,,,
16250,"...""",bablex.php?coord=7100101007122&word=16,"there is no data for this word, it may be unde...",,,,,
15652,"...""",bablex.php?coord=7100101009138&word=2,"there is no data for this word, it may be unde...",,,,,
15622,"...""",bablex.php?coord=7100101009140&word=22,"there is no data for this word, it may be unde...",,,,,


# Dataframe converter

In [15]:
# Run the full load -> transliterate -> fix URLs -> sort pipeline on a second file.
# NOTE(review): this is an absolute, machine-specific Windows path.
check_df = data_process(r'C:\Users\USER\Desktop\Final_Project\Data\Masekhet_er.json')

In [16]:
# Render the processed frame.
check_df

Unnamed: 0,text,url,lexicon_0,lexicon_1,lexicon_2,lexicon_3,lexicon_4,text_transformed
0,m)y,bablex.php?coord=7100301002108&word=0,\n\n\nm)y P01,,,,,מאי
15517,$n),bablex.php?coord=7100301002108&word=1,\n\n\n$ny verb G,,,,,שנא
15516,gby,bablex.php?coord=7100301002108&word=2,\n\n\ngby p01 = gby p --> gb p,,,,,גבי
15515,"""swkh""",bablex.php?coord=7100301002108&word=3,"there is no data for this word, it may be unde...",,,,,סוכה
15513,dtny,bablex.php?coord=7100301002108&word=4,\n\n\nd_ c = d_ c --> dy c,tny verb G,,,,דתני
...,...,...,...,...,...,...,...,...
293,)yhw;,bablex.php?coord=7100301105151&word=5,\n\n\n)yhw P01,,,,,איהו;
292,my,bablex.php?coord=7100301105151&word=6,\n\n\nmy c,,,,,מי
291,l),bablex.php?coord=7100301105151&word=7,\n\n\nl) a,,,,,לא
290,)sr,bablex.php?coord=7100301105151&word=8,\n\n\n)sr verb G,,,,,אסר


In [17]:
# Drop the 'lexicon_4' column.
# NOTE(review): check_df is overwritten, so running this cell a second time
# raises KeyError because the column is already gone.
check_df = check_df.drop(['lexicon_4'], axis=1)


In [18]:
# Render the frame again, now without 'lexicon_4'.
check_df

Unnamed: 0,text,url,lexicon_0,lexicon_1,lexicon_2,lexicon_3,text_transformed
0,m)y,bablex.php?coord=7100301002108&word=0,\n\n\nm)y P01,,,,מאי
15517,$n),bablex.php?coord=7100301002108&word=1,\n\n\n$ny verb G,,,,שנא
15516,gby,bablex.php?coord=7100301002108&word=2,\n\n\ngby p01 = gby p --> gb p,,,,גבי
15515,"""swkh""",bablex.php?coord=7100301002108&word=3,"there is no data for this word, it may be unde...",,,,סוכה
15513,dtny,bablex.php?coord=7100301002108&word=4,\n\n\nd_ c = d_ c --> dy c,tny verb G,,,דתני
...,...,...,...,...,...,...,...
293,)yhw;,bablex.php?coord=7100301105151&word=5,\n\n\n)yhw P01,,,,איהו;
292,my,bablex.php?coord=7100301105151&word=6,\n\n\nmy c,,,,מי
291,l),bablex.php?coord=7100301105151&word=7,\n\n\nl) a,,,,לא
290,)sr,bablex.php?coord=7100301105151&word=8,\n\n\n)sr verb G,,,,אסר
