In [None]:
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "V28"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "TPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Steps Overview\n",
        "\n",
        "1. **Data Extraction:** Extract text from CSV, DOCX, and PDF files.\n",
        "2. **Data Preprocessing:** Convert the extracted text into a format suitable for training.\n",
        "3. **Fine-Tuning with PEFT (LoRA or QLoRA):** Fine-tune the LLaMA2 model on the extracted and preprocessed data.\n",
        "4. **Deploy and Test:** Deploy the model and create a user interface for querying the chatbot."
      ],
      "metadata": {
        "id": "xNE6yBKyUCA3"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 1. Data Extraction"
      ],
      "metadata": {
        "id": "mB3wlFeXUIBg"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "q_NjR_WwT0-0",
        "outputId": "a3e7523d-71cf-4a61-db38-7d1e9657eff5"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting python-docx\n",
            "  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)\n",
            "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/244.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━\u001b[0m \u001b[32m174.1/244.3 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m244.3/244.3 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: lxml>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from python-docx) (4.9.4)\n",
            "Requirement already satisfied: typing-extensions>=4.9.0 in /usr/local/lib/python3.10/dist-packages (from python-docx) (4.12.2)\n",
            "Installing collected packages: python-docx\n",
            "Successfully installed python-docx-1.1.2\n",
            "Collecting PyPDF2\n",
            "  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: PyPDF2\n",
            "Successfully installed PyPDF2-3.0.1\n"
          ]
        }
      ],
      "source": [
        "# Install the third-party parsers first so the imports below succeed on a fresh kernel.\n",
        "# %pip targets the running kernel's environment; versions are pinned for reproducibility.\n",
        "%pip install -q python-docx==1.1.2 PyPDF2==3.0.1\n",
        "\n",
        "import docx\n",
        "import pandas as pd\n",
        "import PyPDF2\n",
        "\n",
        "def extract_text_from_csv(file_path):\n",
        "    df = pd.read_csv(file_path)\n",
        "    text = \"\\n\".join(df.apply(lambda row: \" \".join(row.astype(str)), axis=1).tolist())\n",
        "    return text\n",
        "\n",
        "\n",
        "def extract_text_from_docx(file_path):\n",
        "    doc = docx.Document(file_path)\n",
        "    text = \"\\n\".join([paragraph.text for paragraph in doc.paragraphs])\n",
        "    return text\n",
        "\n",
        "\n",
        "def extract_text_from_pdf(file_path):\n",
        "    text = \"\"\n",
        "    with open(file_path, 'rb') as file:\n",
        "        # Use PdfReader instead of PdfFileReader\n",
        "        reader = PyPDF2.PdfReader(file)\n",
        "        for page in range(len(reader.pages)): # Use len(reader.pages) to get number of pages\n",
        "            text += reader.pages[page].extract_text() # Use pages attribute to access pages\n",
        "    return text"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 2. Data Preprocessing\n",
        "\n",
        "Combine the extracted text from the different files into a single dataset."
      ],
      "metadata": {
        "id": "SOVPz7cAUbgl"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def combine_texts(texts):\n",
        "    return \"\\n\".join(texts)"
      ],
      "metadata": {
        "id": "g82lC5-_UaU3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Format Extracted Texts for LLaMA2 Training\n",
        "\n",
        "Each training example looks like this:\n",
        "\n",
        "```\n",
        "<|input|>What is the capital of France?<|endofinput|><|output|>The capital of France is Paris.<|endofoutput|>\n",
        "<|input|>Explain the theory of relativity.<|endofinput|><|output|>The theory of relativity is a theory by Albert Einstein which states that space and time are relative and all motion must be defined relative to a frame of reference.<|endofoutput|>\n",
        "...\n",
        "```"
      ],
      "metadata": {
        "id": "53cPepwvU_sp"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Pull the raw text out of each source document (CSV, DOCX, PDF, in that order),\n",
        "# then merge everything into a single corpus string.\n",
        "texts = []\n",
        "texts.append(extract_text_from_csv('/content/test.csv'))\n",
        "texts.append(extract_text_from_docx('/content/abcd.docx'))\n",
        "texts.append(extract_text_from_pdf('/content/Machine Learning Engineer-Dipesh Shome.pdf'))\n",
        "combined_text = combine_texts(texts)\n",
        "\n",
        "# Define a template for LLaMA2 training\n",
        "template = \"\"\"\n",
        "<|input|>{input_text}<|endofinput|><|output|>{output_text}<|endofoutput|>\n",
        "\"\"\"\n",
        "\n",
        "# Function to create formatted text\n",
        "def create_formatted_text(input_texts):\n",
        "    formatted_texts = []\n",
        "    for text in input_texts:\n",
        "        input_text = text  # In real use case, split the text into meaningful input and output pairs\n",
        "        output_text = \"Processed response based on the input text\"  # Placeholder\n",
        "        formatted_texts.append(template.format(input_text=input_text, output_text=output_text))\n",
        "    return \"\\n\".join(formatted_texts)\n",
        "\n",
        "# Build the training text: the whole combined corpus is passed as a single input record.\n",
        "formatted_text = create_formatted_text([combined_text])\n"
      ],
      "metadata": {
        "id": "8s9Yx-CzUpu4"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 3. Save Formatted Data for Training\n",
        "\n",
        "Save the formatted text (the result of `create_formatted_text([combined_text])`) to a file, which will be used for training the model.\n"
      ],
      "metadata": {
        "id": "A3d-JjMQXGN6"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Write as UTF-8 explicitly: the extracted text contains non-ASCII characters,\n",
        "# and the platform's default encoding is not guaranteed to handle them.\n",
        "with open('formatted_data.txt', 'w', encoding='utf-8') as f:\n",
        "    f.write(formatted_text)"
      ],
      "metadata": {
        "id": "vt7pxaYrYziU"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Show only a short preview: the full corpus is large and includes personal contact details.\n",
        "formatted_text[:500]"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 196
        },
        "id": "LCrcnGscY9QZ",
        "outputId": "3e68d9d2-4f3c-43b3-80c4-48151a02d7c3"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "\"\\n<|input|>female group C associate's degree standard none 91 86 84\\nfemale group B some college free/reduced completed 53 66 73\\nmale group D bachelor's degree standard none 80 73 72\\nmale group C some college free/reduced none 74 77 73\\nmale group E some college standard completed 84 83 78\\nmale group D associate's degree free/reduced none 81 75 78\\nmale group B associate's degree free/reduced completed 69 70 63\\nfemale group B some high school standard completed 54 61 62\\nmale group C associate's degree free/reduced none 87 73 72\\nmale group B some high school standard completed 51 54 41\\nmale group A high school free/reduced none 45 47 49\\nmale group E some high school standard none 30 26 22\\nfemale group B high school free/reduced completed 67 80 81\\nfemale group D some college free/reduced none 49 65 61\\nmale group D some college standard completed 85 81 85\\nfemale group D some high school standard completed 65 78 82\\nmale group D high school standard none 53 52 42\\nmale group D bachelor's degree free/reduced none 55 46 44\\nfemale group D some high school standard none 48 58 54\\nfemale group D associate's degree free/reduced none 56 65 63\\nmale group C master's degree standard none 79 72 69\\nfemale group C bachelor's degree free/reduced completed 43 51 54\\nfemale group C some college free/reduced completed 45 73 70\\nfemale group C high school free/reduced none 36 53 43\\nmale group D some high school free/reduced completed 80 79 79\\nmale group D associate's degree standard none 80 75 77\\nmale group D bachelor's degree standard completed 68 74 74\\nfemale group C associate's degree standard none 40 59 51\\nfemale group A high school free/reduced completed 34 48 41\\nfemale group D some college free/reduced none 49 58 60\\nmale group B some college standard none 62 61 57\\nmale group D some college standard completed 71 61 69\\nmale group B bachelor's degree free/reduced none 62 63 56\\nmale group E some college 
standard none 76 71 72\\nmale group E some college standard none 84 77 71\\nfemale group B some college free/reduced none 45 53 55\\nmale group D associate's degree free/reduced none 77 78 73\\nfemale group D some college standard none 69 77 77\\nfemale group C master's degree standard none 73 78 74\\nfemale group C some high school free/reduced none 0 17 10\\nmale group C associate's degree standard completed 82 75 77\\nmale group B some high school standard completed 65 66 62\\nmale group D bachelor's degree standard completed 67 61 68\\nfemale group A some college standard none 54 63 67\\nmale group D associate's degree free/reduced none 90 87 75\\nfemale group E high school standard completed 59 63 75\\nmale group D high school free/reduced none 74 70 69\\nfemale group C high school standard none 29 29 30\\nmale group D some high school standard completed 89 88 82\\nfemale group A high school standard completed 75 82 79\\nmale group B some college standard completed 71 75 70\\nfemale group D associate's degree standard none 64 76 74\\nmale group C some college standard completed 79 79 78\\nfemale group B some college free/reduced completed 48 56 58\\nfemale group C some high school standard none 69 73 73\\nfemale group D some college standard none 69 74 74\\nmale group D bachelor's degree standard none 88 78 83\\nmale group C associate's degree standard none 58 54 52\\nmale group B associate's degree standard none 87 85 73\\nfemale group A some high school standard completed 85 90 92\\nmale group A some high school standard completed 46 41 43\\nfemale group C some high school free/reduced completed 71 84 87\\nfemale group C associate's degree standard none 81 77 79\\nfemale group B some college free/reduced none 58 61 66\\nmale group D master's degree free/reduced completed 84 89 90\\nfemale group C bachelor's degree free/reduced completed 66 74 81\\nfemale group D some college free/reduced none 55 71 69\\nmale group C high school free/reduced none 59 53 
52\\nfemale group D some college free/reduced completed 58 63 73\\nfemale group D associate's degree standard none 82 95 89\\nmale group E associate's degree standard completed 66 63 64\\nfemale group C bachelor's degree standard none 81 88 90\\nmale group C some college free/reduced none 58 57 54\\nfemale group A associate's degree free/reduced none 37 57 56\\nmale group C some high school standard completed 63 60 57\\nmale group E some high school standard completed 77 76 77\\nfemale group D some college standard completed 85 86 98\\nmale group B associate's degree free/reduced none 57 56 57\\nfemale group A some high school standard none 48 66 65\\nmale group C some high school standard none 51 52 44\\nmale group D some college free/reduced none 63 61 60\\nmale group D some high school free/reduced none 45 37 37\\nmale group C bachelor's degree standard none 83 78 73\\nfemale group C some college standard none 60 72 74\\nmale group B bachelor's degree standard none 63 71 69\\nfemale group C high school free/reduced none 62 67 64\\nfemale group D some college standard completed 68 78 77\\nfemale group B some high school standard completed 60 70 74\\nfemale group C some high school standard completed 77 90 85\\nmale group A some college free/reduced none 28 23 19\\nmale group C master's degree free/reduced none 79 81 71\\nfemale group E some college standard none 100 92 97\\nmale group D bachelor's degree standard none 69 58 57\\nmale group B high school free/reduced none 66 77 70\\nfemale group B some college standard none 19 38 32\\nmale group D associate's degree standard none 75 68 64\\nmale group D some college standard none 60 63 59\\nfemale group A some college standard none 58 70 67\\nfemale group C associate's degree standard none 69 80 71\\nfemale group C associate's degree free/reduced completed 56 68 70\\nmale group C associate's degree standard completed 73 78 72\\nmale group E some college standard none 66 57 52\\nmale group A associate's degree 
standard none 67 57 53\\nfemale group C associate's degree free/reduced none 64 73 68\\nmale group A high school standard none 71 74 64\\nmale group B high school standard none 70 65 60\\nmale group E associate's degree standard none 53 45 40\\nmale group C high school standard none 75 81 71\\nfemale group B high school standard completed 68 83 78\\nfemale group C high school standard none 44 61 52\\nfemale group D bachelor's degree free/reduced none 29 41 47\\nfemale group B high school free/reduced none 71 87 82\\nmale group A high school standard none 57 51 54\\nfemale group A bachelor's degree standard none 45 59 64\\nfemale group C some college free/reduced none 76 83 88\\nmale group C high school standard none 61 56 55\\nmale group C some high school free/reduced completed 45 52 49\\nmale group D high school standard completed 55 41 48\\nmale group B high school standard completed 73 69 68\\nmale group D high school free/reduced completed 78 77 80\\nfemale group A master's degree free/reduced none 50 67 73\\nfemale group C some college free/reduced none 62 67 62\\nmale group D master's degree standard none 81 81 84\\nfemale group C some high school free/reduced completed 64 79 77\\nfemale group D some high school standard completed 64 60 74\\nmale group D some high school standard none 73 66 62\\nfemale group D associate's degree standard completed 73 75 80\\nfemale group C some high school standard completed 67 74 77\\nmale group B associate's degree standard none 61 42 41\\nmale group C some high school standard completed 67 73 68\\nfemale group D some high school standard none 65 82 81\\nmale group D associate's degree standard none 80 75 69\\nmale group D some high school free/reduced none 59 42 41\\nfemale group E master's degree standard completed 88 99 95\\nfemale group C associate's degree standard none 62 74 70\\nfemale group C high school free/reduced none 33 41 43\\nfemale group C bachelor's degree standard completed 79 92 89\\nmale group B some 
high school standard completed 84 83 75\\nmale group A master's degree free/reduced none 73 74 72\\nfemale group A associate's degree free/reduced none 41 51 48\\nfemale group E associate's degree free/reduced none 50 56 54\\nfemale group B high school standard completed 58 70 68\\nmale group D some high school free/reduced completed 55 59 59\\nmale group D high school standard none 45 48 46\\nmale group D some high school standard completed 88 74 75\\nfemale group B associate's degree free/reduced none 46 61 55\\nmale group A some high school standard none 51 31 36\\nmale group D some high school standard none 75 74 69\\nmale group E some college free/reduced completed 49 52 51\\nfemale group E high school standard none 75 86 79\\nfemale group E high school standard completed 74 79 80\\nfemale group B associate's degree standard completed 61 86 87\\nmale group C associate's degree standard none 62 65 58\\nmale group C some high school free/reduced none 68 63 54\\nfemale group D master's degree standard none 78 91 96\\nfemale group E some college standard none 71 70 76\\nfemale group D high school free/reduced none 49 57 52\\nfemale group A bachelor's degree standard none 59 72 70\\nmale group E bachelor's degree free/reduced completed 79 74 72\\nfemale group E associate's degree standard none 51 51 54\\nfemale group C bachelor's degree standard completed 56 79 72\\nmale group B high school standard completed 76 62 60\\nfemale group D some college standard completed 69 79 81\\nmale group C some high school free/reduced completed 51 56 53\\nmale group D some college standard completed 82 82 88\\nmale group C some college standard none 73 74 61\\nmale group C high school free/reduced completed 40 46 50\\nmale group E some college free/reduced none 93 90 83\\nfemale group C bachelor's degree standard completed 59 64 75\\nfemale group B associate's degree standard none 73 76 80\\nmale group B some high school standard completed 85 84 78\\nmale group E associate's 
degree standard none 76 71 67\\nfemale group D associate's degree free/reduced completed 77 89 98\\nfemale group D some college free/reduced completed 67 86 83\\nmale group D some college free/reduced none 61 47 56\\nfemale group D some high school free/reduced none 27 34 32\\nmale group D high school standard none 54 52 52\\nfemale group C master's degree free/reduced completed 65 81 81\\nfemale group E associate's degree standard none 87 94 95\\nfemale group C some high school standard completed 70 82 76\\nfemale group B high school standard none 54 64 68\\nfemale group C high school free/reduced none 66 76 68\\nfemale group D master's degree free/reduced completed 85 95 100\\nmale group C some high school free/reduced completed 56 61 60\\nmale group E master's degree standard none 90 85 84\\nmale group E high school standard none 70 55 56\\nfemale group B bachelor's degree standard none 61 72 70\\nmale group A bachelor's degree free/reduced completed 49 58 60\\nmale group C high school standard none 81 66 64\\nmale group B some college standard completed 87 84 86\\nmale group B some high school free/reduced completed 49 50 52\\nmale group B some high school standard none 68 54 53\\nmale group C associate's degree free/reduced none 77 67 64\\nfemale group B bachelor's degree free/reduced none 78 79 76\\nmale group C associate's degree free/reduced completed 60 51 56\\nfemale group D high school free/reduced completed 52 57 56\\nmale group E associate's degree standard completed 62 56 53\\nfemale group B some college free/reduced none 74 81 76\\nfemale group C associate's degree standard none 65 77 74\\nfemale group D some high school standard completed 61 74 72\\nAnindita Dey\\nContact: +8801850891617, Email: mishuanindita@gmail.com, \\nAddress: Dhaka, Bangladesh\\n\\nResearch Experience: \\nPostgraduate Research work (March 2023-September 2023)\\nResearch: Muscle structure, chemical composition, gelation characteristics of fish muscle paste\\nThesis: Study on 
Structural Characteristics and Composition of Two Marine Fish Species (Dussumieria acuta, Sardinella fimbriata) of Bangladesh.\\nUndergraduate research work (August 2021-December 2021)\\nThesis: Preparation of Quality Dry Fish (Harpadon nehereus) Using Improved Drying Techniques\\nResearch Supervisor: Dr. Mohammed Nurul Absar Khan \\nChattogram Veterinary and Animal Sciences University (CVASU)\\n\\nInternship: \\nCoastal Biodiversity, Marine Fisheries and Wildlife Research Center, CVASU,\\nMarine Fisheries Technology Station, Bangladesh Fisheries Research Institute (BFRI), COX’S Bazar.\\nRiverine Substation, Rangamati, BFRI\\n\\nAcademic Credentials: \\nMaster of Science in Fishing and Post-Harvest Technology,\\nChattogram Veterinary and Animal Sciences University (CVASU)\\nJanuary 2022- October 2023\\nCGPA: 3.85/4.00\\nBachelor of Science in Fisheries\\nFaculty of Fisheries\\nChattogram Veterinary and Animal Sciences University (CVASU)\\nJanuary 2017- December 2021\\nCGPA:3.78/4.0\\n\\n\\nTechnical skill: \\nData collection, Laboratory skills, Scientific Writing, Research Data Analysis, Statistical Analysis (SPSS), MS Office.\\n\\nExtra-Curricular Activities: \\nMember, Rotaract Club of Chattogram Veterinary and Animal Sciences University. Volunteer, ‘Shishuder jonyo’ a voluntary organization working toward socially deprived children welfare.\\n\\nLanguage Proficiency: \\nBengali: Native language\\nEnglish: proficient user (IELTS score: 7)\\n\\nReferences:\\nDr. Mohammed Nurul Absar Khan,                      Prof. Dr. Md. 
Kamal\\nDean, Faculty of Fisheries, CVASU                     Treasurer, CVASU\\nEmail: mnkabsar@yahoo.com                              Email:\\xa0mkamal772011@gmail.com\\nResearch Supervisor\\t\\t\\t                   Research co-supervisor\\n\\n\\n\\n\\n\\n\\n\\nDipesh\\nShome\\nContact\\n:\\n+8801822409776\\nEmail\\n:\\ndshome.cse@gmail.com\\nAddress\\n:\\nDhaka,\\nBangladesh\\nGitHub\\n:\\nDipesh\\nShome\\nLinkedIn\\n:\\nDipesh\\nShome\\nOverview\\nPassionate\\nabout\\nleveraging\\nmy\\nexpertise\\nin\\nData\\nScience\\nto\\ndevelop\\ninnovative\\nsolutions,\\nwhile\\nshowcasing\\na\\nkeen \\ninterest\\nin\\noptimizing\\nand\\nscaling\\nMachine\\nLearning\\noperations.\\nEager\\nto\\ncontribute\\nto\\na\\ndynamic\\nteam,\\nintegrating \\ncutting-edge\\ntechnologies\\nto\\ndrive\\nefficiency\\nand\\ndeliver\\nimpactful\\nresults\\nin\\nthe\\nrealm\\nof\\nResearch\\nand\\nDevelopment\\nProfessional\\nexperience\\nSoftware\\nEngineer,\\nDot\\nBD\\nSolutions\\nLimited\\n(\\nApril\\n2022\\n-\\nPresent\\n)\\n\\x00\\nImplement\\nETL\\npipeline,\\nautomate\\nthe\\ndata\\ncollection\\nprocess,\\nand\\ndevelop\\ninteractive\\nreports\\nand\\ndashboards\\nto\\nvisualize\\nthe\\ninsights\\nof\\ndata\\nand\\nhelp\\nin\\ndecision-making\\n,\\nreducing\\nreport\\npreparation\\ntime\\nby\\n50%.\\n\\x00\\nDeveloped\\npredictive\\nmodels\\nusing\\nmachine\\nlearning,\\ndeep\\nlearning,\\nand\\nrecommender\\nsystems\\nusing\\nNLP\\nand\\ndeployed\\nthem\\nwith\\nthe\\nMLOps\\npipeline.\\nResearch\\nAssistant,\\nIndian\\nInstitute\\nof\\nTechnology\\nKharagpur.\\n(\\nNovember\\n2022\\n–\\nJanuary\\n2023)\\n\\x00\\nWorked\\non\\na\\nproject\\nnamed\\nClassification\\nof\\nBrain\\nSensorimotor\\nRhythms\\nUtilizing\\nEEG-based\\nFunctional\\nConnectivity.\\nTraining\\nand\\nCertification\\nTraining\\nof\\nTrainers\\n(TOT),\\nEstablishment\\nof\\nBharat\\nBangladesh\\nDigital\\nService\\nand\\nEmployment\\n(\\nBDSET\\n)\\nJune\\n2022\\n–\\nJanuary\\n2023\\n●\\nIssued\\nby:\\nIndian\\n
Institute\\nof\\nTechnology,\\nKharagpur\\n●\\nAuthorized\\nby\\n:\\nBHTPA,\\nICT\\nDivision,\\nBangladesh\\n2023.\\n●\\nTopics\\n:\\nStatistical\\nLearning,\\nFeature\\nEngineering,\\nML\\nAlgorithms,\\nEnsemble\\nLearning,\\nClustering,\\nSVD,\\nPCA, \\nANN,\\nCNN,\\nArchitecture,\\nTransfer\\nLearning,\\nDeep\\nLearning.\\nSkills\\nMatrix\\nProgramming\\nLanguages\\nC,\\nC++,\\nPython,\\nJavaScript\\nDatabase\\nSQL,\\nMySQL,\\nPostgreSQL\\nScientific\\nProgramming\\nLibraries\\nScipy,\\nNumpy,\\nPandas\\nBusiness\\nIntelligence\\nand\\nData\\nVisualization\\nPowerBI,\\nMatplotlib,\\nSeaborn\\nMachine\\nLearning\\nand\\nDeep\\nLearning\\nFrameworks\\nPyTorch,\\nTensorFlow,\\nKeras\\nOrchestration\\nand\\nIntegration\\nTools\\nApache\\nAirflow,\\nSQL\\nServer\\nIntegration\\nServices\\n(SSIS),\\nSQL \\nServer\\nAnalysis\\nServices\\n(SSAS)\\nVersion\\nControl\\nand\\nAutomation\\nTools\\nGit,\\nGitHub,\\nCI/CD,\\nDVC,\\nDockers\\nArtificial\\nIntelligence\\nand\\nMLOps\\nMachine\\nLearning,\\nDeep\\nLearning,\\nMLOps,\\nNLP,\\nLLM\\nChatbotEducation\\nAhsanullah\\nUniversity\\nof\\nScience\\nand\\nTechnology,\\nDecember\\n2016\\n–\\nJune\\n2021\\nBachelor\\nof\\nScience\\nin\\nComputer\\nScience\\nand\\nEngineering.\\nCGPA\\n3.40\\nout\\nof\\n4.00\\nResearches\\nRoad\\nQuality\\nMeasurement\\nfor\\nNational\\nHighway\\nof\\nBangladesh\\nfrom\\nHigh-Resolution\\nSatellite\\nImages.∥\\nPython, \\nComputer\\nVision,\\nSatellite\\nImage\\nProcessing,\\nTensorflow,\\nPytorch,\\nDeep\\nLearning\\n\\x00\\nCreated\\nroad\\ndataset\\nand\\nbuilt\\nan\\nautomated\\nroad\\nquality\\nmeasurement\\nmodel\\nfor\\nBangladesh\\nto\\nclassify\\nthe\\nnational\\nhighway\\nof\\nBangladesh\\nfrom\\nIRI\\nand\\nhigh-resolution\\nsatellite\\nimages\\nwhich \\nacquired\\nmore\\nthan\\n75%\\naccuracy.\\nClassification\\nof\\nBrain\\nSensorimotor\\nRhythms\\nUtilizing\\nEEG-based\\nFunctional\\nConnectivity.∥\\nStatistical\\nAnalysis, 
\\nPython,\\nPandas,\\nNumpy,\\nSNS,\\nSeaborn,\\nMatplotlib.\\n\\x00\\nApplied\\ncorrelation\\ncoefficient,\\nphase\\nlag\\nindex\\nfor\\nstatistical\\nanalysis\\nand\\nmachine\\nlearning\\nmodel\\n:\\nKNN,\\nSVM\\nwith\\ngrid\\nsearch\\n,\\nDeep\\nLearning\\nmod\\nel:\\nCNN,\\nand\\nEnsemble\\nmodel\\nfor\\nclassifications\\nwith\\nmore\\nthan \\n10%\\naccuracy\\nthan\\nprevious.\\nPublications\\nConference\\non\\nProceedings:\\nIGARSS\\n2022\\nProf.\\nDr.\\nKazi\\nA\\nKalpoma,\\nDipesh\\nShome,\\nAnas\\nSikder,\\nAbrar\\nJahin.\\nComprehensive\\nStudy\\non\\nRoad\\nQuality\\nMeasurement \\nfrom\\nHigh-Resolution\\nSatellite\\nImagery.\\nDOI:\\n0.1109/IGARSS46834.2022.9884463\\nProjects\\nBI\\nReport\\n-\\nReal\\nEstate\\nBusiness∥\\nPython,\\nScientific\\nComputing,\\nSQL,\\nPower\\nBI,\\nRepository\\nBI\\nDeveloped\\na\\nreport\\nand\\ndashboard\\nthrough\\nAPI\\nfollowing\\nETL,\\ndata\\nwarehousing,\\ntransformation,\\nand\\nExploratory\\nData \\nAnalysis\\nsteps\\nfor\\nsales\\nperformance\\ntracking,\\ntrend\\nanalysis,\\nforecasting,\\ninstallment\\ncollection\\nperformances,\\nand \\nCustomer\\nbehavior\\nanalysis,\\nimplemented\\npage-level\\nsecurity,\\nand\\nproviding\\nreal-time\\nvisibility\\ninto\\nsales\\nmetrics\\nDiseases\\nClassification\\nwith\\nAI\\nRecommendation\\n//\\nPython,\\nScientific\\nComputing,\\nEDA,\\nDeep\\nLearning,\\nDVC,\\nOpen\\nAI \\nAPI\\nintegration,\\nDockers,\\nAWS.\\nRepository.\\nHens\\ndisease\\nprediction\\nfrom\\nimages\\nand\\naccording\\nto\\nthe\\ndiseases\\ngenerate\\nnecessary\\nrecommendations\\nfrom\\nthe\\nLLm \\nmodel.\\nSteps\\nfollowed\\n:\\nData\\nIngestion,\\nData\\nTransformation,\\nData\\nValidation,\\nEDA,\\nTransfer\\nLearning,\\nTraining\\npipeline, \\nPrediction\\npipeline,\\nVersion\\nControlling,\\nintegration\\nOpenAI\\nAPI,\\nand\\nPipeline\\nTracking\\nwith\\nDVC\\nand\\nDeployment\\nfollowed 
\\nby\\nDocker\\nand\\nCI/CD\\npipeline.\\nScore\\nPrediction\\nData\\nScience\\nProject∥\\nPython,\\nEDA,\\nMachine\\nLearning,\\nMLOps,\\nAWS.\\n,\\nRepository\\nData\\nscience\\nproject\\nfor\\nscore\\nprediction.\\nSteps\\nfollowed:\\nData\\nmodeling,\\nData\\nTransformation,\\nExploratory\\nData\\nAnalysis, \\nFeature\\nengineering,\\nMachine\\nLearning\\nmodels,\\nTraining\\npipeline,\\nPrediction\\npipeline,\\nVersion\\ncontrolling\\nusing\\nDVC,\\nand \\nAWS\\nDeployment\\nstages\\nare\\nfollowed\\nwith\\nbest\\npractices.\\nRetrieval-Augmented\\nGeneration\\n(RAG)\\nsystem\\n//\\nPython,\\nNLP,\\nGenerative\\nAI,\\nPinecone,\\nOpen\\nAI\\nRepository\\nDeveloped\\nan\\nLLM\\nchatbot/RAG\\nsystem\\nby\\nextracting\\ndata\\nfrom\\ndocuments,\\nusing\\nOpenAI\\nembeddings\\nfor\\nstorage\\nin\\na \\nvector\\ndatabase\\nwith\\nindexing,\\nand\\nimplementing\\nsemantic\\nsearch\\nfor\\ninformation\\nretrieval\\nand\\nGPT\\nfor\\ncontextual \\nresponses.\\nProduct\\nRecommendation\\nSystem\\n//\\nPython,\\nEDA,\\nFeature\\nEngineering,\\nML,\\nNLP\\nDeveloped\\na\\ncontent\\n(product\\nattributes)\\nbased\\nproduct\\nrecommendation\\nsystem\\napplying\\ntokenization,\\ntagging,\\nNLTK,\\nCountVectorizer,\\nCBow,\\nand\\nCosine\\nSimilarity\\nmethod.\\nRepository.\\n<|endofinput|><|output|>Processed response based on the input text<|endofoutput|>\\n\""
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Fine-Tuning the LLaMA2 Model"
      ],
      "metadata": {
        "id": "2-yMuNkpVT67"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Use the %pip magic explicitly (installs into the kernel's environment); -q keeps the log short.\n",
        "%pip install -q transformers peft torch"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "57YNqe84VggE",
        "outputId": "9c8e74b1-4c07-4553-df4d-f6a9f34df982"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.41.2)\n",
            "Collecting peft\n",
            "  Downloading peft-0.11.1-py3-none-any.whl (251 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m251.6/251.6 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (2.3.0+cu121)\n",
            "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.15.4)\n",
            "Requirement already satisfied: huggingface-hub<1.0,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.23.4)\n",
            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.25.2)\n",
            "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.1)\n",
            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n",
            "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2024.5.15)\n",
            "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n",
            "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.19.1)\n",
            "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.3)\n",
            "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.4)\n",
            "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from peft) (5.9.5)\n",
            "Collecting accelerate>=0.21.0 (from peft)\n",
            "  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m314.1/314.1 kB\u001b[0m \u001b[31m15.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch) (4.12.2)\n",
            "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch) (1.13.0)\n",
            "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch) (3.3)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch) (3.1.4)\n",
            "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch) (2023.6.0)\n",
            "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)\n",
            "  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n",
            "Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)\n",
            "  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n",
            "Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)\n",
            "  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n",
            "Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)\n",
            "  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n",
            "Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)\n",
            "  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n",
            "Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)\n",
            "  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n",
            "Collecting nvidia-curand-cu12==10.3.2.106 (from torch)\n",
            "  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n",
            "Collecting nvidia-cusolver-cu12==11.4.5.107 (from torch)\n",
            "  Using cached nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n",
            "Collecting nvidia-cusparse-cu12==12.1.0.106 (from torch)\n",
            "  Using cached nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n",
            "Collecting nvidia-nccl-cu12==2.20.5 (from torch)\n",
            "  Using cached nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl (176.2 MB)\n",
            "Collecting nvidia-nvtx-cu12==12.1.105 (from torch)\n",
            "  Using cached nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n",
            "Requirement already satisfied: triton==2.3.0 in /usr/local/lib/python3.10/dist-packages (from torch) (2.3.0)\n",
            "Collecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch)\n",
            "  Downloading nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl (21.3 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.3/21.3 MB\u001b[0m \u001b[31m57.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch) (2.1.5)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.7)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.7.4)\n",
            "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy->torch) (1.3.0)\n",
            "Installing collected packages: nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12, accelerate, peft\n",
            "Successfully installed accelerate-0.32.1 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.20.5 nvidia-nvjitlink-cu12-12.5.82 nvidia-nvtx-cu12-12.1.105 peft-0.11.1\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install datasets"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "2kov7cKEZXNx",
        "outputId": "b91a1085-c32e-4874-a50b-a79cb727a822"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting datasets\n",
            "  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)\n",
            "\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/547.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m256.0/547.8 kB\u001b[0m \u001b[31m7.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m547.8/547.8 kB\u001b[0m \u001b[31m8.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets) (3.15.4)\n",
            "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.25.2)\n",
            "Collecting pyarrow>=15.0.0 (from datasets)\n",
            "  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.8/40.8 MB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets) (0.6)\n",
            "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n",
            "  Downloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m13.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets) (2.0.3)\n",
            "Collecting requests>=2.32.2 (from datasets)\n",
            "  Downloading requests-2.32.3-py3-none-any.whl (64 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m64.9/64.9 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: tqdm>=4.66.3 in /usr/local/lib/python3.10/dist-packages (from datasets) (4.66.4)\n",
            "Collecting xxhash (from datasets)\n",
            "  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m14.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hCollecting multiprocess (from datasets)\n",
            "  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hRequirement already satisfied: fsspec[http]<=2024.5.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2023.6.0)\n",
            "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets) (3.9.5)\n",
            "Requirement already satisfied: huggingface-hub>=0.21.2 in /usr/local/lib/python3.10/dist-packages (from datasets) (0.23.4)\n",
            "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (24.1)\n",
            "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n",
            "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.3.1)\n",
            "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.2.0)\n",
            "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.4.1)\n",
            "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (6.0.5)\n",
            "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (1.9.4)\n",
            "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (4.0.3)\n",
            "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.2->datasets) (4.12.2)\n",
            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.3.2)\n",
            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (3.7)\n",
            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2.0.7)\n",
            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.32.2->datasets) (2024.7.4)\n",
            "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n",
            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2023.4)\n",
            "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2024.1)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n",
            "Installing collected packages: xxhash, requests, pyarrow, dill, multiprocess, datasets\n",
            "  Attempting uninstall: requests\n",
            "    Found existing installation: requests 2.31.0\n",
            "    Uninstalling requests-2.31.0:\n",
            "      Successfully uninstalled requests-2.31.0\n",
            "  Attempting uninstall: pyarrow\n",
            "    Found existing installation: pyarrow 14.0.2\n",
            "    Uninstalling pyarrow-14.0.2:\n",
            "      Successfully uninstalled pyarrow-14.0.2\n",
            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
            "cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 which is incompatible.\n",
            "google-colab 1.0.0 requires requests==2.31.0, but you have requests 2.32.3 which is incompatible.\n",
            "ibis-framework 8.0.0 requires pyarrow<16,>=2, but you have pyarrow 16.1.0 which is incompatible.\u001b[0m\u001b[31m\n",
            "\u001b[0mSuccessfully installed datasets-2.20.0 dill-0.3.8 multiprocess-0.70.16 pyarrow-16.1.0 requests-2.32.3 xxhash-3.4.1\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "application/vnd.colab-display-data+json": {
              "pip_warning": {
                "packages": [
                  "pyarrow"
                ]
              },
              "id": "43c82987d6c44f0cba0c56602069a2c6"
            }
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from transformers import LlamaForCausalLM, LlamaTokenizer\n",
        "from peft import get_peft_model, LoraConfig, TaskType\n",
        "from datasets import Dataset\n",
        "from transformers import Trainer, TrainingArguments\n",
        "\n",
        "def train_model(data):\n",
        "    # Load tokenizer and model\n",
        "    tokenizer = LlamaTokenizer.from_pretrained('NousResearch/Llama-2-7b-chat-hf')\n",
        "    model = LlamaForCausalLM.from_pretrained('NousResearch/Llama-2-7b-chat-hf')\n",
        "\n",
        "    # Prepare the dataset\n",
        "    dataset = Dataset.from_dict({\"text\": [data]})\n",
        "\n",
        "    def tokenize_function(examples):\n",
        "        return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True, max_length=512)\n",
        "\n",
        "    tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
        "\n",
        "    # Define LoRA configuration\n",
        "    lora_config = LoraConfig(\n",
        "        task_type=TaskType.CAUSAL_LM,\n",
        "        r=4,  # rank of the low-rank matrices\n",
        "        lora_alpha=16,\n",
        "        lora_dropout=0.1,\n",
        "        target_modules=['q_proj', 'v_proj']  # modules to apply LoRA\n",
        "    )\n",
        "\n",
        "    # Apply LoRA to the model\n",
        "    model = get_peft_model(model, lora_config)\n",
        "\n",
        "    # Set up training arguments\n",
        "    training_args = TrainingArguments(\n",
        "        output_dir=\"./results\",\n",
        "        evaluation_strategy=\"epoch\",\n",
        "        learning_rate=2e-5,\n",
        "        per_device_train_batch_size=4,\n",
        "        per_device_eval_batch_size=4,\n",
        "        num_train_epochs=3,\n",
        "        weight_decay=0.01,\n",
        "    )\n",
        "\n",
        "    # Initialize Trainer\n",
        "    trainer = Trainer(\n",
        "        model=model,\n",
        "        args=training_args,\n",
        "        train_dataset=tokenized_datasets,\n",
        "    )\n",
        "\n",
        "    # Fine-tune the model with LoRA\n",
        "    trainer.train()\n",
        "\n",
        "    # Save the fine-tuned model\n",
        "    model.save_pretrained('./results')\n",
        "    tokenizer.save_pretrained('./results')\n"
      ],
      "metadata": {
        "id": "cUf6ehGAVUDQ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "#Deploy and Test\n",
        "Flask Backend (app.py)"
      ],
      "metadata": {
        "id": "p2_OcDKgV-Wq"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from flask import Flask, request, jsonify\n",
        "from flask_cors import CORS\n",
        "from transformers import pipeline, LlamaTokenizer, LlamaForCausalLM\n",
        "\n",
        "app = Flask(__name__)\n",
        "CORS(app)\n",
        "\n",
        "# Load fine-tuned model and tokenizer\n",
        "model = LlamaForCausalLM.from_pretrained('./results')\n",
        "tokenizer = LlamaTokenizer.from_pretrained('./results')\n",
        "\n",
        "# Create a pipeline for text generation\n",
        "chatbot = pipeline(\"text-generation\", model=model, tokenizer=tokenizer)\n",
        "\n",
        "@app.route('/query', methods=['POST'])\n",
        "def query():\n",
        "    user_input = request.json['input']\n",
        "    response = chatbot(user_input, max_length=100)\n",
        "    return jsonify(response[0]['generated_text'])\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    app.run(debug=True)\n"
      ],
      "metadata": {
        "id": "6HD3QOatV9bz"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "IZFDUheuV8_o"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}