***<h1 style="text-align: center;">ETL pipeline using Kaggle, Databricks & Snowflake</h1>***

### Installation of Kaggle & Snowflake connector

In [0]:
%pip install kaggle
%pip install snowflake-connector-python

Python interpreter will be restarted.
Collecting kaggle
  Downloading kaggle-1.5.16.tar.gz (83 kB)
Collecting tqdm
  Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)
Collecting python-slugify
  Downloading python_slugify-8.0.1-py2.py3-none-any.whl (9.7 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py): started
  Building wheel for kaggle (setup.py): finished with status 'done'
  Created wheel for kaggle: filename=kaggle-1.5.16-py3-none-any.whl size=110699 sha256=3e97027b25b857b3a2426c3e1d1e701b298c14476090f7599c1d96854aef6ec9
  Stored in directory: /root/.cache/pip/wheels/d2/ed/a5/da3a0cfb13373d1ace41cafa4f2467d858c55c52473ba72799
Successfully built kaggle
Installing collected packages: text-unidecode, tqdm, python-slugify, kaggle
Successfully installed kaggle-1.5.16 python-slugify-8.0.1 text-unidecode-1.3 tqdm-4.66.1
Python interpreter will be restarted.

### Function to setup Kaggle Authentication

In [0]:
import json
def kaggle_authentication(kaggle_auth, output_file):
    try:
        data = json.loads(kaggle_auth)
        with open(output_file, 'w') as file:
            json.dump(data, file, indent=4)
        print('successful authentication')
    except:
        print('authentication failed')

In [0]:
kaggle_authentication('{"username":"majdiakrmi","key":"3aa4b20c5ac5c0e1c8c82725d3cd3ffb"}', '/root/.kaggle/kaggle.json')

successful authentication


In [0]:
%sh
cat /root/.kaggle/kaggle.json

{
    "username": "majdiakrmi",
    "key": "3aa4b20c5ac5c0e1c8c82725d3cd3ffb"
}

### List Kaggle datasets

In [0]:
%sh kaggle datasets list

ref                                                        title                                         size  lastUpdated          downloadCount  voteCount  usabilityRating  
---------------------------------------------------------  -------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
nelgiriyewithana/top-spotify-songs-2023                    Most Streamed Spotify Songs 2023              47KB  2023-08-26 11:04:57          12857        417  1.0              
joebeachcapital/students-performance                       Students Performance                           2KB  2023-08-31 00:50:11           4339        115  1.0              
carlmcbrideellis/zzzs-lightweight-training-dataset-target  Zzzs: Lightweight training dataset + target  185MB  2023-09-20 19:52:40            461         69  1.0              
iamsouravbanerjee/airline-dataset                          Airline Dataset                                8MB  2023-09-1

### Restrict the read permission of the file

In [0]:
%sh
chmod 600 /root/.kaggle/kaggle.json

## <span style="color:blue"> 1) Extract Data </span>

### Download Kaggle Dataset

In [0]:
%sh
kaggle datasets download -d sujaykapadnis/programming-languages

Downloading programming-languages.zip to /databricks/driver

  0%|          | 0.00/859k [00:00<?, ?B/s]
100%|██████████| 859k/859k [00:00<00:00, 18.2MB/s]



### List download files

In [0]:
dbutils.fs.ls('file:/databricks/driver')

Out[55]: [FileInfo(path='file:/databricks/driver/azure/', name='azure/', size=4096, modificationTime=1695272205540),
 FileInfo(path='file:/databricks/driver/preload_class.lst', name='preload_class.lst', size=1306936, modificationTime=1695272205564),
 FileInfo(path='file:/databricks/driver/conf/', name='conf/', size=4096, modificationTime=1695272204968),
 FileInfo(path='file:/databricks/driver/hadoop_accessed_config.lst', name='hadoop_accessed_config.lst', size=2755, modificationTime=1695272205552),
 FileInfo(path='file:/databricks/driver/eventlogs/', name='eventlogs/', size=4096, modificationTime=1695318593829),
 FileInfo(path='file:/databricks/driver/root\\.kaggle\\kaggle.json', name='root\\.kaggle\\kaggle.json', size=79, modificationTime=1695324098873),
 FileInfo(path='file:/databricks/driver/metastore_db/', name='metastore_db/', size=4096, modificationTime=1695318919009),
 FileInfo(path='file:/databricks/driver/logs/', name='logs/', size=4096, modificationTime=1695326497776),
 FileI

### Unzip the files

In [0]:
%sh
unzip /databricks/driver/programming-languages.zip

Archive:  /databricks/driver/programming-languages.zip
  inflating: languages.csv           


In [0]:
dbutils.fs.ls('file:/databricks/driver/languages.csv')

Out[64]: [FileInfo(path='file:/databricks/driver/languages.csv', name='languages.csv', size=2618104, modificationTime=1694844548000)]

### Move unzipped files from driver mount to Filestore (Bronze Layer)

In [0]:
dbutils.fs.mv('file:/databricks/driver/languages.csv', 'dbfs:/FileStore/', recurse=True)

Out[65]: True

### List Filestore to see the moved files

In [0]:
dbutils.fs.ls('dbfs:/FileStore')

Out[68]: [FileInfo(path='dbfs:/FileStore/languages.csv', name='languages.csv', size=2618104, modificationTime=1695328536000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/', name='shared_uploads/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/', name='tables/', size=0, modificationTime=0)]

## <span style="color:blue"> 2) Transform Data </span>

### Spark Read CSV 

In [0]:
df = spark.read.csv('dbfs:/FileStore/languages.csv', header=True)

In [0]:
display(df)

pldb_id,title,description,type,appeared,creators,website,domain_name,domain_name_registered,reference,isbndb,book_count,semantic_scholar,language_rank,github_repo,github_repo_stars,github_repo_forks,github_repo_updated,github_repo_subscribers,github_repo_created,github_repo_description,github_repo_issues,github_repo_first_commit,github_language,github_language_tm_scope,github_language_type,github_language_ace_mode,github_language_file_extensions,github_language_repos,wikipedia,wikipedia_daily_page_views,wikipedia_backlinks_count,wikipedia_summary,wikipedia_page_id,wikipedia_appeared,wikipedia_created,wikipedia_revision_count,wikipedia_related,features_has_comments,features_has_semantic_indentation,features_has_line_comments,line_comment_token,last_activity,number_of_users,number_of_jobs,origin_community,central_package_repository_count,file_type,is_open_source
java,Java,,pl,1995,James Gosling,https://oracle.com/java/,,,,400,401.0,37.0,0,,,,,,,,,,Java,source.java,programming,java,java jav,11529980,https://en.wikipedia.org/wiki/Java_(programming_language),5242,11543,"""Java is a general-purpose computer programming language that is concurrent, class-based, object-oriented, and specifically designed to have as few implementation dependencies as possible. It is intended to let application developers """"write once","run anywhere"""" (WORA)",meaning that compiled Java code can run on all platforms that support Java without the need for recompilation. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of computer architecture. As of 2016,Java is one of the most popular programming languages in use,particularly for client-server web applications,with a reported 9 million developers. Java was originally developed by James Gosling at Sun Microsystems (which has since been acquired by Oracle Corporation) and released in 1995 as a core component of Sun Microsystems' Java platform. The language derives much of its syntax from C and C++,but it has fewer low-level facilities than either of them. The original and reference implementation Java compilers,virtual machines,and class libraries were originally released by Sun under proprietary licenses. As of May 2007,in compliance with the specifications of the Java Community Process,Sun relicensed most of its Java technologies under the GNU General Public License. Others have also developed alternative implementations of these Sun technologies,such as the GNU Compiler for Java (bytecode compiler),GNU Classpath (standard libraries),and IcedTea-Web (browser plugin for applets). The latest version is Java 9,released on September 21,2017,"and is one of the two versions currently supported for free by Oracle. Versions earlier than Java 8 are supported by companies on a commercial basis; e.g. by Oracle back to Java 6 as of October 2017 (while they still """"highly recommend that you uninstall"""" pre-Java 8 from at least Windows computers)."""
javascript,JavaScript,,pl,1995,Brendan Eich,,,,https://www.w3schools.com/js/js_reserved.asp,349,351.0,48.0,1,,,,,,,,,,JavaScript,source.js,programming,javascript,js _js bones cjs es es6 frag gs jake javascript jsb jscad jsfl jslib jsm jspre jss jsx mjs njs pac sjs ssjs xsjs xsjslib,16046489,https://en.wikipedia.org/wiki/JavaScript,4264,8982,"JavaScript (), often abbreviated as JS, is a high-level, dynamic, weakly typed, prototype-based, multi-paradigm, and interpreted programming language. Alongside HTML and CSS, JavaScript is one of the three core technologies of World Wide Web content production. It is used to make webpages interactive and provide online programs, including video games. The majority of websites employ it, and all modern web browsers support it without the need for plug-ins by means of a built-in JavaScript engine. Each of the many JavaScript engines represent a different implementation of JavaScript, all based on the ECMAScript specification, with some engines not supporting the spec fully, and with many engines supporting additional features beyond ECMA. As a multi-paradigm language, JavaScript supports event-driven, functional, and imperative (including object-oriented and prototype-based) programming styles. It has an API for working with text, arrays, dates, regular expressions, and basic manipulation of the DOM, but the language itself does not include any I/O, such as networking, storage, or graphics facilities, relying for these upon the host environment in which it is embedded. Initially only implemented client-side in web browsers, JavaScript engines are now embedded in many other types of host software, including server-side in web servers and databases, and in non-web programs such as word processors and PDF software, and in runtime environments that make JavaScript available for writing mobile and desktop applications, including desktop widgets. Although there are strong outward similarities between JavaScript and Java, including language name, syntax, and respective standard libraries, the two languages are distinct and differ greatly in design; JavaScript was influenced by programming languages such as Self and Scheme.",9845,1995,2001,6131,java lua scheme perl self c python awk hypertalk actionscript coffeescript dart livescript objective-j opa perl-6 qml typescript json ecmascript html regex pdf tcl c-- vbscript jscript jquery npm-pm mongodb sql max unity-engine google-apps-script objective-c applescript visual-studio-editor asmjs processing oberon smalltalk scala racket llvmir fantom haxe clojure kotlin squeak wasm,TRUE,FALSE,TRUE,//,2022,5962666,63993,Netscape,,text,
c,C,,pl,1972,Dennis Ritchie,,,,http://www.c4learn.com/c-programming/c-keywords/,78,78.0,19.0,2,,,,,,,,,,C,source.c,programming,c_cpp,c cats h idc,2160271,https://en.wikipedia.org/wiki/C_(programming_language),6268,10585,"C (, as in the letter c) is a general-purpose, imperative computer programming language, supporting structured programming, lexical variable scope and recursion, while a static type system prevents many unintended operations. By design, C provides constructs that map efficiently to typical machine instructions, and therefore it has found lasting use in applications that had formerly been coded in assembly language, including operating systems, as well as various application software for computers ranging from supercomputers to embedded systems. C was originally developed by Dennis Ritchie between 1969 and 1973 at Bell Labs, and used to re-implement the Unix operating system. It has since become one of the most widely used programming languages of all time, with C compilers from various vendors available for the majority of existing computer architectures and operating systems. C has been standardized by the American National Standards Institute (ANSI) since 1989 (see ANSI C) and subsequently by the International Organization for Standardization (ISO). C is an imperative procedural language. It was designed to be compiled using a relatively straightforward compiler, to provide low-level access to memory, to provide language constructs that map efficiently to machine instructions, and to require minimal run-time support. Despite its low-level capabilities, the language was designed to encourage cross-platform programming. A standards-compliant and portably written C program can be compiled for a very wide variety of computer platforms and operating systems with few changes to its source code. The language has become available on a very wide range of platforms, from embedded microcontrollers to supercomputers.",6021,2011,2001,7316,cyclone unified-parallel-c split-c cilk b bcpl cpl algol-68 assembly-language pl-i ampl awk c-- csharp objective-c d go java javascript julia limbo lpc perl php pike processing python rust seed7 vala verilog unix algol swift multics unicode fortran pascal mathematica matlab ch smalltalk,TRUE,FALSE,TRUE,//,2022,3793768,59919,Bell Labs,0,text,
python,Python,,pl,1991,Guido van Rossum,https://www.python.org/,python.org,1995,https://www.programiz.com/python-programming/keyword-list,339,342.0,52.0,3,,,,,,,,,,Python,source.python,programming,python,py cgi fcgi gyp gypi lmi py3 pyde pyi pyp pyt pyw rpy smk spec tac wsgi xpy,9300725,https://en.wikipedia.org/wiki/Python_(programming_language),7204,6849,"Python is a widely used high-level programming language for general-purpose programming, created by Guido van Rossum and first released in 1991. An interpreted language, Python has a design philosophy that emphasizes code readability (notably using whitespace indentation to delimit code blocks rather than curly brackets or keywords), and a syntax that allows programmers to express concepts in fewer lines of code than might be used in languages such as C++ or Java. It provides constructs that enable clear programming on both small and large scales. Python features a dynamic type system and automatic memory management. It supports multiple programming paradigms, including object-oriented, imperative, functional and procedural, and has a large and comprehensive standard library. Python interpreters are available for many operating systems. CPython, the reference implementation of Python, is open source software and has a community-based development model, as do nearly all of its variant implementations. CPython is managed by the non-profit Python Software Foundation.",23862,1991,2001,6342,jython micropython stackless-python cython abc algol-68 c dylan haskell icon java lisp modula-3 perl boo cobra coffeescript d f-sharp falcon genie go groovy javascript julia nim ruby swift setl unix unicode standard-ml pascal regex csharp common-lisp scheme objective-c numpy mime http sagemath llvmir jvm java-bytecode cil pyrex mercurial python-for-s60 qt django scipy matplotlib gdb freebsd ecmascript ocaml tcl erlang pandas,TRUE,TRUE,TRUE,#,2022,2818037,46976,Centrum Wiskunde & Informatica,,text,
sql,SQL,,queryLanguage,1974,Donald D. Chamberlin and Raymond F. Boyce,,,,,177,182.0,37.0,4,,,,,,,,,,SQL,source.sql,data,sql,sql cql ddl inc mysql prc tab udf viw,1222,https://en.wikipedia.org/wiki/SQL,3084,4159,"""SQL ( ( listen) ESS-kew-EL or ( listen) SEE-kwəl or SKWEEL, Structured Query Language) is a domain-specific language used in programming and designed for managing data held in a relational database management system (RDBMS), or for stream processing in a relational data stream management system (RDSMS). In comparison to older read/write APIs like ISAM or VSAM, SQL offers two main advantages: first, it introduced the concept of accessing many records with one single command; and second, it eliminates the need to specify how to reach a record, e.g. with or without an index. Originally based upon relational algebra and tuple relational calculus, SQL consists of a data definition language, data manipulation language, and data control language. The scope of SQL includes data insert, query, update and delete, schema creation and modification, and data access control. Although SQL is often described as, and to a great extent is, a declarative language (4GL), it also includes procedural elements. SQL was one of the first commercial languages for Edgar F. Codd's relational model, as described in his influential 1970 paper, """"A Relational Model of Data for Large Shared Data Banks"""". Despite not entirely adhering to the relational model as described by Codd",it became the most widely used database language. SQL became a standard of the American National Standards Institute (ANSI) in 1986,and of the International Organization for Standardization (ISO) in 1987. Since then,the standard has been revised to include a larger set of features. Despite the existence of such standards,"most SQL code is not completely portable among different database systems without adjustments.""",29004,1986,2001,4153,sql-92 datalog linq powershell c sql-psm sqlpl transact-sql mysql pl-sql ada postgresql plpgsql java perl python tcl javascript xml xquery dot-ql isbl quel mumps isbn doi,TRUE,FALSE,TRUE,--,2022,7179119,219617
cpp,C++,,pl,1985,Bjarne Stroustrup,http://isocpp.org/,isocpp.org,2012,,128,128.0,6.0,6,,,,,,,,,,C++,source.c++,programming,c_cpp,cpp c++ cc cp cxx h h++ hh hpp hxx inc inl ino ipp ixx re tcc tpp,2161625,https://en.wikipedia.org/wiki/C++,4307,10943,"C++ ( pronounced cee plus plus) is a general-purpose programming language. It has imperative, object-oriented and generic programming features, while also providing facilities for low-level memory manipulation. It was designed with a bias toward system programming and embedded, resource-constrained and large systems, with performance, efficiency and flexibility of use as its design highlights. C++ has also been found useful in many other contexts, with key strengths being software infrastructure and resource-constrained applications, including desktop applications, servers (e.g. e-commerce, web search or SQL servers), and performance-critical applications (e.g. telephone switches or space probes). C++ is a compiled language, with implementations of it available on many platforms. Many vendors provide C++ compilers, including the Free Software Foundation, Microsoft, Intel, and IBM. C++ is standardized by the International Organization for Standardization (ISO), with the latest standard version ratified and published by ISO in December 2014 as ISO/IEC 14882:2014 (informally known as C++14). The C++ programming language was initially standardized in 1998 as ISO/IEC 14882:1998, which was then amended by the C++03, ISO/IEC 14882:2003, standard. The current C++14 standard supersedes these and C++11, with new features and an enlarged standard library. Before the initial standardization in 1998, C++ was developed by Bjarne Stroustrup at Bell Labs since 1979, as an extension of the C language as he wanted an efficient and flexible language similar to C, which also provided high-level features for program organization. The C++17 standard is due in July 2017, with the draft largely implemented by some compilers already, and C++20 is the next planned standard thereafter. Many other programming languages have been influenced by C++, including C#, D, Java, and newer versions of C.",72038,1998,2001,1487,ada algol-68 c clu ml simula python csharp chapel d java lua perl php rust nim sql bcpl unix assembly-language regex,,,,//,2022,4128238,61098,Bell Labs,0,text,
html,HTML,,textMarkup,1991,Tim Berners-Lee,,,,,111,116.0,7.0,5,,,,,,,,,,HTML,text.html.basic,markup,html,html hta htm htmlhl inc xht xhtml,12648584,https://en.wikipedia.org/wiki/HTML,4897,15298,"Hypertext Markup Language (HTML) is the standard markup language for creating web pages and web applications. With Cascading Style Sheets (CSS) and JavaScript it forms a triad of cornerstone technologies for the World Wide Web. Web browsers receive HTML documents from a web server or from local storage and render them into multimedia web pages. HTML describes the structure of a web page semantically and originally included cues for the appearance of the document. HTML elements are the building blocks of HTML pages. With HTML constructs, images and other objects, such as interactive forms, may be embedded into the rendered page. It provides a means to create structured documents by denoting structural semantics for text such as headings, paragraphs, lists, links, quotes and other items. HTML elements are delineated by tags, written using angle brackets. Tags such as and introduce content into the page directly. Others such as ... surround and provide information about document text and may include other tags as sub-elements. Browsers do not display the HTML tags, but use them to interpret the content of the page. HTML can embed programs written in a scripting language such as JavaScript which affect the behavior and content of web pages. Inclusion of CSS defines the look and layout of content. The World Wide Web Consortium (W3C), maintainer of both the HTML and the CSS standards, has encouraged the use of CSS over explicit presentational HTML since 1997.",13191,1993,2001,7612,sgml css javascript webgl dtd rfc xml unicode utf-8 http fat,TRUE,FALSE,FALSE,,2022,5570873,69531,Conseil Européen pour la Recherche Nucléaire,0,text,
xml,XML,,dataNotation,1996,,,,,,151,151.0,37.0,7,,,,,,,,,,XML,text.xml,data,xml,xml adml admx ant axaml axml builds ccproj ccxml clixml cproject cscfg csdef csl csproj ct depproj dita ditamap ditaval dllconfig dotsettings filters fsproj fxml glade gml gmx grxml gst hzp iml ivy jelly jsproj kml launch mdpolicy mjml mm mod mxml natvis ncl ndproj nproj nuspec odd osm pkgproj pluginspec proj props ps1xml psc1 pt qhelp rdf res resx rs rss sch scxml sfproj shproj srdf storyboard sublime-snippet targets tml ts tsx ui urdf ux vbproj vcxproj vsixmanifest vssettings vstemplate vxml wixproj workflow wsdl wsf wxi wxl wxs x3d xacro xaml xib xlf xliff xmi xmldist xmp xproj xsd xspec xul zcml,3258,https://en.wikipedia.org/wiki/XML,2861,5441,"In computing, Extensible Markup Language (XML) is a markup language that defines a set of rules for encoding documents in a format that is both human-readable and machine-readable. The W3C's XML 1.0 Specification and several other related specifications—all of them free open standards—define XML. The design goals of XML emphasize simplicity, generality, and usability across the Internet. It is a textual data format with strong support via Unicode for different human languages. Although the design of XML focuses on documents, the language is widely used for the representation of arbitrary data structures such as those used in web services. Several schema systems exist to aid in the definition of XML-based languages, while programmers have developed many application programming interfaces (APIs) to aid the processing of XML data.",34138,1996,2001,4025,sgml unicode soap ooxml utf-8 ascii html regex xpath xquery scala java smalltalk php python ecmascript rdf javascript hytime json yaml s-expressions,TRUE,FALSE,FALSE,,2022,1917452,42277,,0,text,
php,PHP,,pl,1995,Rasmus Lerdorf,https://php.net,php.net,1997,http://php.net/manual/en/reserved.keywords.php,269,274.0,26.0,8,https://github.com/php/php-src,33991,7234.0,2022.0,1441,2011.0,The PHP Interpreter,392,1999.0,PHP,text.html.php,programming,php,php aw ctp fcgi inc php3 php4 php5 phps phpt,3479326,https://en.wikipedia.org/wiki/PHP,3151,7839,"PHP is a server-side scripting language designed primarily for web development but also used as a general-purpose programming language. Originally created by Rasmus Lerdorf in 1994, the PHP reference implementation is now produced by The PHP Development Team. PHP originally stood for Personal Home Page, but it now stands for the recursive acronym PHP: Hypertext Preprocessor. PHP code may be embedded into HTML or HTML5 markup, or it can be used in combination with various web template systems, web content management systems and web frameworks. PHP code is usually processed by a PHP interpreter implemented as a module in the web server or as a Common Gateway Interface (CGI) executable. The web server software combines the results of the interpreted and executed PHP code, which may be any type of data, including images, with the generated web page. PHP code may also be executed with a command-line interface (CLI) and can be used to implement standalone graphical applications. The standard PHP interpreter, powered by the Zend Engine, is free software released under the PHP License. PHP has been widely ported and can be deployed on most web servers on almost every operating system and platform, free of charge. The PHP language evolved without a written formal specification or standard until 2014, leaving the canonical PHP interpreter as a de facto standard. Since 2014 work has gone on to create a formal PHP specification.",24131,1994,2001,10104,c hhvm parrot-vm java perl tcl falcon hack html x86-isa unicode wordpress json mysql mime javascript xml parrot-internal-representation cil ftp postgresql sqlite aws java-server-pages linux python mediawiki drupal,TRUE,FALSE,TRUE,//,2022,2356101,30349,Zend,,text,
perl,Perl,,pl,1987,Larry Wall,https://www.perl.org,perl.org,1995,,270,276.0,9.0,9,,,,,,,,,,Perl,source.perl,programming,perl,pl al cgi fcgi perl ph plx pm psgi t,169830,https://en.wikipedia.org/wiki/Perl,1299,4942,"""Perl is a family of high-level, general-purpose, interpreted, dynamic programming languages. The languages in this family include Perl 5 and Perl 6. Though Perl is not officially an acronym, there are various backronyms in use, including """"Practical Extraction and Reporting Language"""". Perl was originally developed by Larry Wall in 1987 as a general-purpose Unix scripting language to make report processing easier. Since then",it has undergone many changes and revisions. Perl 6,which began as a redesign of Perl 5 in 2000,eventually evolved into a separate language. Both languages continue to be developed independently by different development teams and liberally borrow ideas from one another. The Perl languages borrow features from other programming languages including C,shell script (sh),AWK,and sed. They provide powerful text processing facilities without the arbitrary data-length limits of many contemporary Unix commandline tools,facilitating easy manipulation of text files. Perl 5 gained widespread popularity in the late 1990s as a CGI scripting language,in part due to its then unsurpassed regular expression and string parsing abilities. In addition to CGI,Perl 5 is used for system administration,network programming,finance,bioinformatics,and other applications,"such as for GUIs. It has been nicknamed """"the Swiss Army chainsaw of scripting languages"""" because of its flexibility and power",and also its ugliness. In 1998,"it was also referred to as the """"duct tape that holds the Internet together"""""


In [0]:
df.count()

Out[71]: 4303

### Filter dataframe to collect Java title

In [0]:
c_df = df.filter(df.title.contains('C++'))
display(c_df)

pldb_id,title,description,type,appeared,creators,website,domain_name,domain_name_registered,reference,isbndb,book_count,semantic_scholar,language_rank,github_repo,github_repo_stars,github_repo_forks,github_repo_updated,github_repo_subscribers,github_repo_created,github_repo_description,github_repo_issues,github_repo_first_commit,github_language,github_language_tm_scope,github_language_type,github_language_ace_mode,github_language_file_extensions,github_language_repos,wikipedia,wikipedia_daily_page_views,wikipedia_backlinks_count,wikipedia_summary,wikipedia_page_id,wikipedia_appeared,wikipedia_created,wikipedia_revision_count,wikipedia_related,features_has_comments,features_has_semantic_indentation,features_has_line_comments,line_comment_token,last_activity,number_of_users,number_of_jobs,origin_community,central_package_repository_count,file_type,is_open_source
cpp,C++,,pl,1985,Bjarne Stroustrup,http://isocpp.org/,isocpp.org,2012.0,,128.0,128,6.0,6,,,,,,,,,,C++,source.c++,programming,c_cpp,cpp c++ cc cp cxx h h++ hh hpp hxx inc inl ino ipp ixx re tcc tpp,2161625.0,https://en.wikipedia.org/wiki/C++,4307.0,10943.0,"C++ ( pronounced cee plus plus) is a general-purpose programming language. It has imperative, object-oriented and generic programming features, while also providing facilities for low-level memory manipulation. It was designed with a bias toward system programming and embedded, resource-constrained and large systems, with performance, efficiency and flexibility of use as its design highlights. C++ has also been found useful in many other contexts, with key strengths being software infrastructure and resource-constrained applications, including desktop applications, servers (e.g. e-commerce, web search or SQL servers), and performance-critical applications (e.g. telephone switches or space probes). C++ is a compiled language, with implementations of it available on many platforms. Many vendors provide C++ compilers, including the Free Software Foundation, Microsoft, Intel, and IBM. C++ is standardized by the International Organization for Standardization (ISO), with the latest standard version ratified and published by ISO in December 2014 as ISO/IEC 14882:2014 (informally known as C++14). The C++ programming language was initially standardized in 1998 as ISO/IEC 14882:1998, which was then amended by the C++03, ISO/IEC 14882:2003, standard. The current C++14 standard supersedes these and C++11, with new features and an enlarged standard library. Before the initial standardization in 1998, C++ was developed by Bjarne Stroustrup at Bell Labs since 1979, as an extension of the C language as he wanted an efficient and flexible language similar to C, which also provided high-level features for program organization. The C++17 standard is due in July 2017, with the draft largely implemented by some compilers already, and C++20 is the next planned standard thereafter. Many other programming languages have been influenced by C++, including C#, D, Java, and newer versions of C.",72038,1998,2001.0,1487.0,ada algol-68 c clu ml simula python csharp chapel d java lua perl php rust nim sql bcpl unix assembly-language regex,,,,//,2022.0,4128238.0,61098,Bell Labs,0.0,text,
objective-cpp,Objective C++,Objective-C++ is simply source code that mixes Objective-C classes and C++ classes.,pl,1993,,,,,https://en.wikipedia.org/wiki/Objective-C,,0,,787,,,,,,,,,,Objective-C++,source.objc++,programming,objectivec,mm,535669.0,,,,,,,,,,True,False,True,//,2018.0,200.0,0,Apple,0.0,text,
micro-cpp,ΜC++,,pl,1992,pabuhr,https://plg.uwaterloo.ca/usystem/uC++.html,,,,0.0,0,0.0,1539,https://github.com/pabuhr/uCPP,133.0,26.0,2022.0,13.0,2013.0,concurrency for C++,2.0,2013.0,,,,,,,https://en.wikipedia-on-ipfs.org/wiki/%CE%9CC%2B%2B,14.0,22.0,"""μC++, also called uC++, is a programming language, an extension of C++ designed for concurrent programming. Among other features, it adds coroutines, tasks, and monitors, and extends existing language constructs to integrate with them. Its compiler, named u++, operates as a source-to-source translator targeting C++. μC++ is part of the μSystem project, of the University of Waterloo, Ontario, Canada, a large-scale project led by professor Peter Buhr with the goal to create a """"highly-concurrent shared-memory programming system"""".It is used in course CS 343 in University of Waterloo.Every μC++ program should include the uC++.h header file before any other header",although this is not necessary for more recent versions. uC++ is now open source,"available on GitHub.""",3405199.0,2006.0,2005,,,,,,,2022,303,0.0,University of Waterloo,0.0
aspectcpp,AspectC++,,pl,2001,,http://aspectc.org/,aspectc.org,2001.0,,,0,5.0,1790,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2022.0,1.0,0,Various developers,,,
concurrent-cpp,Concurrent C++,,pl,1988,,,,,https://semanticscholar.org/paper/016343974357eac84e053921efc0a33f2f0b2eee,,0,0.0,4119,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1988.0,0.0,0,Bell Laboratories,0.0,,
rcpp,RC++,,pl,2001,,,,,https://semanticscholar.org/paper/18bab5069d96b2d4d4fab9f160c750605f62f2ef,,0,,4290,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2001.0,0.0,0,IKuni Inc && Imperial College London,0.0,,
abcpp,ABC++,,pl,1994,,,,,https://semanticscholar.org/paper/0014195f11078f8e48eba5a9bf1b213d9db28093,,0,,3356,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1994.0,0.0,0,IBM,0.0,,
pcpp,pC++,,pl,1991,,,,,https://semanticscholar.org/paper/f0d0e8e319f4f733d066f6490cee425a2d864d84,,0,,3996,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1991.0,0.0,0,University of Renne && Indiana University,0.0,,


## <span style="color:blue"> 3) Load Data </span>

### Connection to Snowflake

In [0]:
snowOptions = {
    "sfURL" : "https://rpmrbue-yx05377.snowflakecomputing.com",
    "sfUser" : "MAJDIAKRMI",
    "sfPassword" : "Majdi@djo77",
    "sfDatabase" : "K_D_S_ETL",
    "sfSchema" : "LOAD_DATA",
    "sfWarehouse" : "COMPUTE_WH",
    "sfRole" : "accountadmin",
}

SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"

### Write the dataframe to Snowflake

In [0]:
c_df.write \
    .format(SNOWFLAKE_SOURCE_NAME) \
    .options(**snowOptions) \
    .option("dbtable", "C_language") \
    .mode("overwrite") \
    .save()

***<h1 style="text-align: center;">End of the project</h1>***