In [17]:
import os
# Set CUDA_VISIBLE_DEVICES to "0" before any GPU-using library is imported.
# NOTE(review): presumably this pins the run to the first GPU — confirm on multi-GPU hosts.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

## Data Scraping

In [18]:
from bs4 import BeautifulSoup
import requests
from pathlib import Path
from tqdm import tqdm

In [19]:

def scrap_data(wiki_link, save_file_path = "./input_data/movie.txt"):
    """Download a web page and save the text of all its <p> elements to a file.

    Args:
        wiki_link: URL of the page to fetch.
        save_file_path: Destination text file; it is overwritten if it exists.
            Missing parent directories are created.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the request times out).
    """
    # A timeout keeps one unresponsive server from hanging the whole notebook.
    page_to_scrape = requests.get(wiki_link, timeout=30)
    soup = BeautifulSoup(page_to_scrape.text, "html.parser")

    # Paragraph texts are joined with no separator between them.
    para = ''.join(paragraph.getText() for paragraph in soup.select('p'))

    parent_dir = os.path.dirname(save_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8: article text routinely contains non-ASCII characters,
    # which the platform default encoding may not be able to represent.
    with open(save_file_path, 'w', encoding="utf-8") as file:
        file.write(para)

In [20]:
def save_aspect(aspect, content, save_file_path = "./input_data/movie.txt"):
    """Write one aspect of a movie to a text file, preceded by a header.

    The file holds a header line naming the aspect in upper case, an
    underline of dashes, and then ``content``.

    Args:
        aspect: Aspect name (for example "plot"); upper-cased in the header.
        content: Text of the aspect.
        save_file_path: Destination file; it is overwritten if it exists.
            Missing parent directories are created.
    """
    introduction = f"HERE IS THE DETAILS OF MOVIE'S {aspect.upper()}: \n"
    underline = "-----------------------------------------------------\n"
    para = introduction + underline + content

    parent_dir = os.path.dirname(save_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(save_file_path, 'w', encoding="utf-8") as file:
        file.write(para)


def scrape(wiki_link):
    """Scrape a movie page section by section.

    Args:
        wiki_link: URL of the movie's page.

    Returns:
        dict: ``"summary"`` maps to the lead paragraphs joined by single
        spaces. Every level-2 section whose title (case-insensitively) is one
        of the aspects listed below maps, under its title as written on the
        page, to the concatenated text of the elements following its heading
        up to the next level-2 heading.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (including when the request times out).
    """
    Aspects = [
        'Plot',
        'Cast',
        'Production',
        'Music',
        'Soundtrack',
        'Themes',
        'Accolades',
        ]

    Aspects = [aspect.lower() for aspect in Aspects]

    # A timeout keeps one unresponsive server from hanging the whole batch.
    response = requests.get(wiki_link, timeout=30)

    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')

    scraped_dict = {}

    #-------------------Summary scraping--------------------------
    # The summary is every non-empty <p> located before the first level-2
    # section heading. Headings are the same 'mw-heading2' <div> wrappers that
    # the aspect scraping below relies on, so both parts agree on where a
    # section starts. If the page has no such heading, all paragraphs are kept.
    summary_paragraphs = []
    for paragraph in soup.find_all('p'):
        if paragraph.find_previous('div', class_='mw-heading2') is not None:
            break

        paragraph_text = paragraph.get_text().strip()
        if paragraph_text:
            summary_paragraphs.append(paragraph_text)

    scraped_dict["summary"] = ' '.join(summary_paragraphs)

    # -------------------Other Aspect Scraping---------------------
    Headings = soup.find_all('div', class_ = 'mw-heading mw-heading2')
    for heading in Headings:
        # Keep only the title in front of any bracketed suffix (e.g. "[edit]").
        aspect = heading.text.split('[', 1)[0].strip()

        if aspect.lower() not in Aspects:
            continue

        section_parts = []
        for next_sibling in heading.find_next_siblings():
            if next_sibling.name == 'style':
                continue

            if next_sibling.name == 'div':
                # A <div> without a class attribute yields None; treat it as
                # having no classes instead of crashing on the membership test.
                classes = next_sibling.get('class') or []
                if 'mw-heading2' in classes:  # the next section starts here
                    break

            section_parts.append(" " + next_sibling.text)

        scraped_dict[aspect] = ''.join(section_parts)
    return scraped_dict

## Loading Document

In [21]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [22]:
## Directory holding the text files that are fed to the document loader
inputdirectory = Path("./input_data")
## Directory that receives the generated csv files
outputdirectory = Path("./output_data")

## Node generation

In [24]:
# sudo apt-get install libmagic1
# curl -fsSL https://ollama.com/install.sh | sh
#ollama serve
#ollama run zephyr

In [25]:
# !pip install openpyxl
# !pip install pandas
# !pip install langchain
# !pip install -U langchain-community
# !pip install unstructured
# !pip install yachalk

## Start from here

In [28]:
# !pip install openpyxl
import uuid

import sys
sys.path.append("..")

import json
import ollama.client as client

In [None]:
def doc_to_df(documents) -> pd.DataFrame:
    """Turn document chunks into a DataFrame with one row per chunk.

    Args:
        documents: Iterable of objects exposing ``page_content`` (the chunk
            text) and ``metadata`` (a mapping of extra fields).

    Returns:
        DataFrame with a ``text`` column, one column per metadata key, and a
        ``chunk_id`` column holding a fresh random 32-character hex id. A
        ``chunk_id`` key in the metadata is overridden by the generated id.
    """
    # Build all rows in one pass; re-creating the list for every chunk
    # (rows = rows + [row]) copies it each time and is quadratic.
    rows = [
        {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        for chunk in documents
    ]

    df = pd.DataFrame(rows)
    return df

In [None]:
def prompt(input: str, metadata={}, model="mistral-openorca:latest"):
    """Ask the LLM to extract (node_1, node_2, edge) triples from a text chunk.

    Args:
        input: The context chunk to analyse.
        metadata: Extra key/value pairs merged into every extracted triple.
            The default dict is only read, never mutated.
        model: Name of the model to query; ``None`` selects
            "mistral-openorca:latest".

    Returns:
        A list of dicts (one per triple, each extended with ``metadata``), or
        ``None`` when the model response is not valid JSON or does not have
        the expected list-of-objects shape.
    """
    if model is None:
        model = "mistral-openorca:latest"

    # The prompt text is kept byte-for-byte so results stay comparable with
    # earlier runs.
    # NOTE(review): adjacent literals are concatenated without a separator in
    # two places ("...mentioned" + "in the context" and "]" + "Don't allow"),
    # and the text contains a few typos; fix them only if a change in model
    # output is acceptable.
    SYS_PROMPT = (
        "You create network graphs by identifying terms and their relationships within a given context. "
        "You are given a context chunk (enclosed by ```). Your task is to identify the key concepts mentioned"
        "in the context and extract their ontology. \n"

        "Thought 1: When analyzing the text, carefully identify significant terms from each sentence. \n"
        "These terms should represent essential elements such as objects, places, people, organizations, \n"
        "concepts, acronums, or services. Aim to extract terms in their simplest form, ensuring each one captures a core idea without unncessary compleity. \n\n"

        "Thought 2: Once the significant terms are identified, evaluate how these terms connect. \n"
        "Consider terms that appear together in the same sentence or paragraph, as their proximity often indicates \n"
        "a logical relationship. Keep in mind that a single term may relate to multiple others, reflecting the interconnected nature of the text. \n\n"

        "Thought 3: For every pair of connected terms, identify and clearly specify the type of relationship that exists between them.\n"
        "Use short, precise descriptions or labels to define these relationships. Represent the connections in a structured format, \n"
        "such as a list of JSON objects, wehre each entry consists of two related terms (nodes) and the description of their relationship (edge). \n\n"

        "[\n"
        "   {\n"
        '       "node_1": "A term identified from the ontology",\n'
        '       "node_2": "A related term identified from the ontology",\n'
        '       "edge": "the connection or relationship between node_1 and node_2 described in one or two sentences"\n'
        "   }, {...}\n"
        "]"

        "Don't allow any effect of your previous response\n"
        "Take your time as much as you require in generating the response\n"
        "Ensure the edge information will contain at leat 5 words including the node_1 and Node_2.\n"
    )

    USER_PROMPT = f"context: ```{input}``` \n\n output: "
    response, _ = client.generate(model_name=model, system=SYS_PROMPT, prompt=USER_PROMPT)

    # Bind `result` before the try block: if json.loads itself fails, the
    # handler below would otherwise hit an unbound local and raise instead of
    # returning None.
    result = None
    try:
        result = json.loads(response)
        result = [dict(item, **metadata) for item in result]
    except Exception:
        # Best effort: a malformed response only costs this chunk.
        # `except Exception` (not a bare except) lets Ctrl-C through.
        print(f"RESULT TYPE = {type(result)}")
        result = None
    return result

In [29]:
def df_to_grph(dataframe: pd.DataFrame, model=None) -> list:
    """Run the extraction prompt on every chunk and flatten the results.

    Args:
        dataframe: One row per chunk, with ``text`` and ``chunk_id`` columns.
        model: Model name forwarded to ``prompt``.

    Returns:
        A single flat list with the triples of all chunks. Chunks whose
        response could not be parsed are skipped; the list is empty when the
        frame has no rows or no chunk produced a usable response.
    """
    # Nothing to prompt for; also avoids row-wise apply on an empty frame.
    if dataframe.empty:
        return []

    results = dataframe.apply(
        lambda row: prompt(row.text, {"chunk_id": row.chunk_id}, model), axis=1
    )
    # Chunks with an invalid response come back as None/NaN; drop them.
    results = results.dropna().reset_index(drop=True)

    ## Flatten the list of lists to one single list of entities.
    # A plain comprehension also copes with zero remaining results, where
    # np.concatenate would raise.
    concept_list = [
        concept for chunk_concepts in results for concept in chunk_concepts
    ]
    return concept_list


def grph_to_df(nodes_list) -> pd.DataFrame:
    """Build the graph edge table from a list of extracted triples.

    Args:
        nodes_list: List of dicts carrying at least ``node_1`` and ``node_2``.

    Returns:
        DataFrame without the rows whose ``node_1`` or ``node_2`` is missing
        or a single space, and with both node columns lower-cased. An empty
        ``nodes_list`` gives an empty frame that still has both node columns.
    """
    graph_dataframe = pd.DataFrame(nodes_list)

    # An empty or malformed list yields a frame without the node columns;
    # add them so the steps below work and callers always see the same schema.
    for column in ("node_1", "node_2"):
        if column not in graph_dataframe.columns:
            graph_dataframe[column] = np.nan

    ## Remove all NaN entities
    graph_dataframe = graph_dataframe.replace(" ", np.nan)
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])

    # The LLM sometimes returns non-string nodes (e.g. numbers); convert to
    # str first so lower-casing cannot raise AttributeError.
    graph_dataframe = graph_dataframe.assign(
        node_1=graph_dataframe["node_1"].astype(str).str.lower(),
        node_2=graph_dataframe["node_2"].astype(str).str.lower(),
    )

    return graph_dataframe

In [30]:
# generation for bollywood movies. change the "bollywood" to "hollywood" when required
# Column names applied positionally to the sheet below.
# NOTE(review): assumes the sheet has exactly these five columns in this order — confirm against Movie_list.xlsx.
columns = ["YoR", "movie_name", "imdb_rating", "wiki_link", "popular"]
movie_links = pd.read_excel("./Movie_list.xlsx", sheet_name = "bollywood", engine='openpyxl')  # spreadsheet listing each movie together with its wiki link
movie_links.columns = columns
# movie_links.head()

In [31]:
# Split the movie list by its "popular" label; labels are compared exactly
# (case-sensitive), so "popular" and "Least popular" must match the sheet.
is_popular = movie_links["popular"] == "popular"
is_least_popular = movie_links["popular"] == "Least popular"
popular_movie_links = movie_links.loc[is_popular]
least_popular_movie_links = movie_links.loc[is_least_popular]
# least_popular_movie_links.head()

In [None]:
# Driver: for every movie, scrape its page aspect by aspect, build a knowledge
# graph for each aspect with the LLM, and save it as
# <root_output_folder>/<popular label>/<movie_name>_<YoR>_<aspect>.csv
root_output_folder = "./bollywood"
movie_categories = [least_popular_movie_links, popular_movie_links]

## To regenerate the graph with LLM, set this to True
regenerate = True

# The splitter configuration never changes, so build it once instead of once
# per aspect.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

for movie_category in tqdm(movie_categories):
    for index, row in movie_category.iterrows():

        movie_name = row["movie_name"]
        YoR = row["YoR"]
        wiki_link = row["wiki_link"]
        popular = row["popular"]

        # scrape data from the given link aspectwise and get it in a dictionary.
        # One unreachable or malformed page must not abort the whole batch.
        try:
            aspect_dict = scrape(wiki_link)
        except Exception as exc:
            print(f"[skipped] {movie_name} ({YoR}): scraping {wiki_link} failed: {exc!r}")
            continue

        # Now, generate nodes from each aspect individually after saving their
        # content in the input directory one by one.
        for aspect in aspect_dict:
            try:
                # Overwrites the single input file, so the loader below only
                # sees the current aspect.
                # NOTE(review): this relies on save_aspect's default path being
                # inside `inputdirectory` and on no other files living there.
                save_aspect(aspect, aspect_dict[aspect])

                # load document in dataframe chunk
                loader = DirectoryLoader(inputdirectory, show_progress=True)
                documents = loader.load()
                pages = splitter.split_documents(documents)

                # Create dataframe of chunks
                df = doc_to_df(pages)

                # node generation task
                if regenerate:
                    concepts_list = df_to_grph(df, model='zephyr:latest')
                    dfg1 = grph_to_df(concepts_list)
                    os.makedirs(outputdirectory, exist_ok=True)

                    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
                    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
                else:
                    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

                dfg1 = dfg1.replace("", np.nan)
                dfg1 = dfg1.dropna(subset=["node_1", "node_2", 'edge'])
                # NOTE(review): constant value given to every edge; its meaning
                # is not visible in this notebook — confirm with the consumer.
                dfg1['count'] = 4

                # save the nodes dataframe in csv_file
                save_folder_name = popular
                save_file_name = movie_name + "_" + str(YoR) + "_" + aspect +".csv"
                save_path = os.path.join(root_output_folder, save_folder_name, save_file_name)
                # The category folder is not guaranteed to exist; without this
                # every to_csv call fails and nothing is ever written.
                os.makedirs(os.path.dirname(save_path), exist_ok=True)
                dfg1.to_csv(save_path, index=False)

            except Exception as exc:
                # Best effort per aspect, but report the failure instead of
                # hiding it. `except Exception` still lets Ctrl-C stop the run.
                print(f"[skipped] {movie_name} ({YoR}) / {aspect}: {exc!r}")
                continue