In [63]:
import csv

# Define the input and output file paths
input_file = "/content/preprocessed_dataset.txt"
output_file = "data.csv"

# Column names for the CSV file
header = ["word", "chunk", "postposition", "head-postag", "dependency", "is_arg", "srl", "predicate"]

# Function to parse each line and extract relevant information
def parse_line(line):
    elements = line.split()
    word = elements[0] if len(elements) > 0 else ""
    chunk = elements[1] if len(elements) > 1 else ""
    postposition = elements[2].split('_')[1] if len(elements) > 2 and '_' in elements[2] else ""
    head_postag = elements[3] if len(elements) > 3 else ""
    dependency = elements[4] if len(elements) > 4 else ""
    is_arg = elements[5] if len(elements) > 5 else ""
    srl = elements[6] if len(elements) > 6 else ""
    predicate = elements[7] if len(elements) > 7 else ""
    return [word, chunk, postposition, head_postag, dependency, is_arg, srl, predicate]



cnt = 0
validlines =0
with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", newline="", encoding="utf-8") as outfile:
    reader = infile.readlines()
    writer = csv.writer(outfile)
    writer.writerow(header)
    for line in reader:
        cnt+=1
        row = parse_line(line.strip())
        if row is not None:  # Check if row is not None before writing
            validlines+=1
            writer.writerow(row)

print("CSV file has been created successfully.")
print(validlines,cnt)

CSV file has been created successfully.
14412 14412


In [64]:
# prompt: readcsv file using pandas and display head

import pandas as pd

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('data.csv')

# Display the first few rows of the DataFrame
print(df)


            word chunk postposition head-postag   dependency  is_arg  \
0          सरकार    NP           का         NP2           r6     0.0   
1         भूमिका   NP2          NaN        VGNF           k2     1.0   
2         अपनाते  VGNF        हो+एं         NP4  nmod__k1inv     0.0   
3         झारखंड   NP3          में        VGNF          k7p     1.0   
4      छापामारों   NP4           का         VGF           k1     1.0   
...          ...   ...          ...         ...          ...     ...   
14407      माहौल   NP5          NaN        VGNN          pof     0.0   
14408      बनाने  VGNN           के         VGF           rt     1.0   
14409        देश   NP6          में         VGF          k7p     1.0   
14410     अभियान   NP7          NaN         VGF           k2     1.0   
14411    चलाएंगे   VGF          NaN           0         root     0.0   

            srl predicate  
0           NaN       NaN  
1          ARG1      VGNF  
2           NaN       NaN  
3      ARGM-LOC      VG

In [20]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4227139 sha256=316aafee42bc7fbb4cf3e06c52984beb53904237a1268682fcdd3349d437b380
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.12.0


In [1]:
!pip install progressbar2




In [13]:
import fasttext.util

# Download the Hindi fastText model if it hasn't been downloaded already
fasttext.util.download_model('hi', if_exists='ignore')  # Hindi


ft_hindi = fasttext.load_model('cc.hi.300.bin')


Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz





In [71]:
import csv
def generate_embedding(word):
    try:
        embedding = ft_hindi.get_word_vector(word)
        return embedding
    except KeyError:
        return [0.0] * 300

data = []
with open("data.csv", "r", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        word = row["word"]
        embedding = generate_embedding(word)
        data.append({"word": embedding})


output_file = "embeddings.csv"
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ["word"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in data:
        writer.writerow(row)

print("Embeddings have been written to embeddings.csv")

Embeddings have been written to embeddings.csv


In [72]:
import pandas as pd

df = pd.read_csv('embeddings.csv')

In [81]:
df["word"] = df["word"].str.strip("[]")

# Split each embedding string into individual dimensions
df["word"] = df["word"].apply(lambda x: x.split())

# Expand the list of dimensions into separate columns
embedding_df = df["word"].apply(pd.Series)

embedding_df.columns = [f"dim_{i+1}" for i in range(embedding_df.shape[1])]

# Concatenate the original DataFrame with the new DataFrame containing embeddings
result_df = pd.concat([df.drop(columns=["word"]), embedding_df], axis=1)

result_df.to_csv("final_data.csv", index=False)

print("Embeddings with dimensions have been written to embeddings_with_dimensions.csv")

Embeddings with dimensions have been written to embeddings_with_dimensions.csv


In [83]:
embeddings_df = pd.read_csv("embeddings_with_dimensions.csv")

data_df = pd.read_csv("data.csv")

merged_df = pd.concat([embeddings_df, data_df.iloc[:, 1:]], axis=1)  # Assuming additional columns start from the second column

merged_df.to_csv("merged_embeddings_data.csv", index=False)

print("Merged data with embeddings have been written to merged_embeddings_data.csv")


Merged data with embeddings have been written to merged_embeddings_data.csv
